Merge V8 5.3.332.45.  DO NOT MERGE

Test: Manual

FPIIM-449

Change-Id: Id3254828b068abdea3cb10442e0172a8c9a98e03
(cherry picked from commit 13e2dadd00298019ed862f2b2fc5068bba730bcf)
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_cmath.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_cmath.h
index 9fe3f71..ae7ff2f 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_cmath.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_cmath.h
@@ -26,54 +26,39 @@
 #error "This file is for CUDA compilation only."
 #endif
 
-// CUDA allows using math functions form std:: on device side.  This
-// file provides __device__ overloads for math functions that map to
-// appropriate math functions provided by CUDA headers or to compiler
-// builtins if CUDA does not provide a suitable function.
+// CUDA lets us use various std math functions on the device side.  This file
+// works in concert with __clang_cuda_math_forward_declares.h to make this work.
+//
+// Specifically, the forward-declares header declares __device__ overloads for
+// these functions in the global namespace, then pulls them into namespace std
+// with 'using' statements.  Then this file implements those functions, after
+// the implementations have been pulled in.
+//
+// It's important that we declare the functions in the global namespace and pull
+// them into namespace std with using statements, as opposed to simply declaring
+// these functions in namespace std, because our device functions need to
+// overload the standard library functions, which may be declared in the global
+// namespace or in std, depending on the degree of conformance of the stdlib
+// implementation.  Declaring in the global namespace and pulling into namespace
+// std covers all of the known knowns.
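+//
+// As an illustrative sketch (one function only; this file and the
+// forward-declares header do the same for the whole list below), the pieces
+// fit together like so:
+//
+//   // __clang_cuda_math_forward_declares.h:
+//   __device__ float sin(float);
+//   namespace std { using ::sin; }
+//
+//   // This file, included after <cmath>:
+//   __DEVICE__ float sin(float __x) { return ::sinf(__x); }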
 
 #define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
 
-namespace std {
 __DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
 __DEVICE__ long abs(long __n) { return ::labs(__n); }
-using ::abs;
 __DEVICE__ float abs(float __x) { return ::fabsf(__x); }
 __DEVICE__ double abs(double __x) { return ::fabs(__x); }
 __DEVICE__ float acos(float __x) { return ::acosf(__x); }
-using ::acos;
-using ::acosh;
 __DEVICE__ float asin(float __x) { return ::asinf(__x); }
-using ::asin;
-using ::asinh;
 __DEVICE__ float atan(float __x) { return ::atanf(__x); }
-using ::atan;
 __DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
-using ::atan2;
-using ::atanh;
-using ::cbrt;
 __DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
-using ::ceil;
-using ::copysign;
 __DEVICE__ float cos(float __x) { return ::cosf(__x); }
-using ::cos;
 __DEVICE__ float cosh(float __x) { return ::coshf(__x); }
-using ::cosh;
-using ::erf;
-using ::erfc;
 __DEVICE__ float exp(float __x) { return ::expf(__x); }
-using ::exp;
-using ::exp2;
-using ::expm1;
 __DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
-using ::fabs;
-using ::fdim;
 __DEVICE__ float floor(float __x) { return ::floorf(__x); }
-using ::floor;
-using ::fma;
-using ::fmax;
-using ::fmin;
 __DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
-using ::fmod;
 __DEVICE__ int fpclassify(float __x) {
   return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                               FP_ZERO, __x);
@@ -85,9 +70,8 @@
 __DEVICE__ float frexp(float __arg, int *__exp) {
   return ::frexpf(__arg, __exp);
 }
-using ::frexp;
-using ::hypot;
-using ::ilogb;
+__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
 __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
 __DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
 __DEVICE__ bool isgreater(float __x, float __y) {
@@ -102,8 +86,6 @@
 __DEVICE__ bool isgreaterequal(double __x, double __y) {
   return __builtin_isgreaterequal(__x, __y);
 }
-__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
-__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
 __DEVICE__ bool isless(float __x, float __y) {
   return __builtin_isless(__x, __y);
 }
@@ -132,36 +114,18 @@
 __DEVICE__ bool isunordered(double __x, double __y) {
   return __builtin_isunordered(__x, __y);
 }
-using ::labs;
 __DEVICE__ float ldexp(float __arg, int __exp) {
   return ::ldexpf(__arg, __exp);
 }
-using ::ldexp;
-using ::lgamma;
-using ::llabs;
-using ::llrint;
 __DEVICE__ float log(float __x) { return ::logf(__x); }
-using ::log;
 __DEVICE__ float log10(float __x) { return ::log10f(__x); }
-using ::log10;
-using ::log1p;
-using ::log2;
-using ::logb;
-using ::lrint;
-using ::lround;
 __DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
-using ::modf;
-using ::nan;
-using ::nanf;
-using ::nearbyint;
-using ::nextafter;
 __DEVICE__ float nexttoward(float __from, float __to) {
   return __builtin_nexttowardf(__from, __to);
 }
 __DEVICE__ double nexttoward(double __from, double __to) {
   return __builtin_nexttoward(__from, __to);
 }
-using ::pow;
 __DEVICE__ float pow(float __base, float __exp) {
   return ::powf(__base, __exp);
 }
@@ -171,28 +135,13 @@
 __DEVICE__ double pow(double __base, int __iexp) {
   return ::powi(__base, __iexp);
 }
-using ::remainder;
-using ::remquo;
-using ::rint;
-using ::round;
-using ::scalbln;
-using ::scalbn;
 __DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
 __DEVICE__ bool signbit(double __x) { return ::__signbit(__x); }
 __DEVICE__ float sin(float __x) { return ::sinf(__x); }
-using ::sin;
 __DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
-using ::sinh;
 __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
-using ::sqrt;
 __DEVICE__ float tan(float __x) { return ::tanf(__x); }
-using ::tan;
 __DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
-using ::tanh;
-using ::tgamma;
-using ::trunc;
-
-} // namespace std
 
 #undef __DEVICE__
 
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_math_forward_declares.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_math_forward_declares.h
new file mode 100644
index 0000000..3f2834d
--- /dev/null
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_math_forward_declares.h
@@ -0,0 +1,263 @@
+/*===- __clang_cuda_math_forward_declares.h - Prototypes of __device__ math fns -===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// This file forward-declares some math functions we (or the CUDA headers)
+// will define later.  We need to do this, and do it before cmath is included,
+// because the standard library may have constexpr math functions.  In the
+// absence of a prior __device__ decl, those constexpr functions may become
+// implicitly host+device.  host+device functions can't be overloaded, so that
+// would preclude the use of our own __device__ overloads for these functions.
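+//
+// For illustration (a sketch of the failure mode, not verbatim stdlib code):
+//
+//   constexpr float sin(float);   // stdlib; may become implicitly
+//                                 // host+device under CUDA
+//   __device__ float sin(float);  // ours; cannot be added as an overload
+//                                 // of a host+device function
+//
+// Declaring the __device__ overload first avoids this.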
+
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__                                                             \
+  static __inline__ __attribute__((always_inline)) __attribute__((device))
+
+__DEVICE__ double abs(double);
+__DEVICE__ float abs(float);
+__DEVICE__ int abs(int);
+__DEVICE__ long abs(long);
+__DEVICE__ long long abs(long long);
+__DEVICE__ double acos(double);
+__DEVICE__ float acos(float);
+__DEVICE__ double acosh(double);
+__DEVICE__ float acosh(float);
+__DEVICE__ double asin(double);
+__DEVICE__ float asin(float);
+__DEVICE__ double asinh(double);
+__DEVICE__ float asinh(float);
+__DEVICE__ double atan2(double, double);
+__DEVICE__ float atan2(float, float);
+__DEVICE__ double atan(double);
+__DEVICE__ float atan(float);
+__DEVICE__ double atanh(double);
+__DEVICE__ float atanh(float);
+__DEVICE__ double cbrt(double);
+__DEVICE__ float cbrt(float);
+__DEVICE__ double ceil(double);
+__DEVICE__ float ceil(float);
+__DEVICE__ double copysign(double, double);
+__DEVICE__ float copysign(float, float);
+__DEVICE__ double cos(double);
+__DEVICE__ float cos(float);
+__DEVICE__ double cosh(double);
+__DEVICE__ float cosh(float);
+__DEVICE__ double erfc(double);
+__DEVICE__ float erfc(float);
+__DEVICE__ double erf(double);
+__DEVICE__ float erf(float);
+__DEVICE__ double exp2(double);
+__DEVICE__ float exp2(float);
+__DEVICE__ double exp(double);
+__DEVICE__ float exp(float);
+__DEVICE__ double expm1(double);
+__DEVICE__ float expm1(float);
+__DEVICE__ double fabs(double);
+__DEVICE__ float fabs(float);
+__DEVICE__ double fdim(double, double);
+__DEVICE__ float fdim(float, float);
+__DEVICE__ double floor(double);
+__DEVICE__ float floor(float);
+__DEVICE__ double fma(double, double, double);
+__DEVICE__ float fma(float, float, float);
+__DEVICE__ double fmax(double, double);
+__DEVICE__ float fmax(float, float);
+__DEVICE__ double fmin(double, double);
+__DEVICE__ float fmin(float, float);
+__DEVICE__ double fmod(double, double);
+__DEVICE__ float fmod(float, float);
+__DEVICE__ int fpclassify(double);
+__DEVICE__ int fpclassify(float);
+__DEVICE__ double frexp(double, int *);
+__DEVICE__ float frexp(float, int *);
+__DEVICE__ double hypot(double, double);
+__DEVICE__ float hypot(float, float);
+__DEVICE__ int ilogb(double);
+__DEVICE__ int ilogb(float);
+__DEVICE__ bool isfinite(double);
+__DEVICE__ bool isfinite(float);
+__DEVICE__ bool isgreater(double, double);
+__DEVICE__ bool isgreaterequal(double, double);
+__DEVICE__ bool isgreaterequal(float, float);
+__DEVICE__ bool isgreater(float, float);
+__DEVICE__ bool isinf(double);
+__DEVICE__ bool isinf(float);
+__DEVICE__ bool isless(double, double);
+__DEVICE__ bool islessequal(double, double);
+__DEVICE__ bool islessequal(float, float);
+__DEVICE__ bool isless(float, float);
+__DEVICE__ bool islessgreater(double, double);
+__DEVICE__ bool islessgreater(float, float);
+__DEVICE__ bool isnan(double);
+__DEVICE__ bool isnan(float);
+__DEVICE__ bool isnormal(double);
+__DEVICE__ bool isnormal(float);
+__DEVICE__ bool isunordered(double, double);
+__DEVICE__ bool isunordered(float, float);
+__DEVICE__ long labs(long);
+__DEVICE__ double ldexp(double, int);
+__DEVICE__ float ldexp(float, int);
+__DEVICE__ double lgamma(double);
+__DEVICE__ float lgamma(float);
+__DEVICE__ long long llabs(long long);
+__DEVICE__ long long llrint(double);
+__DEVICE__ long long llrint(float);
+__DEVICE__ double log10(double);
+__DEVICE__ float log10(float);
+__DEVICE__ double log1p(double);
+__DEVICE__ float log1p(float);
+__DEVICE__ double log2(double);
+__DEVICE__ float log2(float);
+__DEVICE__ double logb(double);
+__DEVICE__ float logb(float);
+__DEVICE__ double log(double);
+__DEVICE__ float log(float);
+__DEVICE__ long lrint(double);
+__DEVICE__ long lrint(float);
+__DEVICE__ long lround(double);
+__DEVICE__ long lround(float);
+__DEVICE__ double modf(double, double *);
+__DEVICE__ float modf(float, float *);
+__DEVICE__ double nan(const char *);
+__DEVICE__ float nanf(const char *);
+__DEVICE__ double nearbyint(double);
+__DEVICE__ float nearbyint(float);
+__DEVICE__ double nextafter(double, double);
+__DEVICE__ float nextafter(float, float);
+__DEVICE__ double nexttoward(double, double);
+__DEVICE__ float nexttoward(float, float);
+__DEVICE__ double pow(double, double);
+__DEVICE__ double pow(double, int);
+__DEVICE__ float pow(float, float);
+__DEVICE__ float pow(float, int);
+__DEVICE__ double remainder(double, double);
+__DEVICE__ float remainder(float, float);
+__DEVICE__ double remquo(double, double, int *);
+__DEVICE__ float remquo(float, float, int *);
+__DEVICE__ double rint(double);
+__DEVICE__ float rint(float);
+__DEVICE__ double round(double);
+__DEVICE__ float round(float);
+__DEVICE__ double scalbln(double, long);
+__DEVICE__ float scalbln(float, long);
+__DEVICE__ double scalbn(double, int);
+__DEVICE__ float scalbn(float, int);
+__DEVICE__ bool signbit(double);
+__DEVICE__ bool signbit(float);
+__DEVICE__ double sin(double);
+__DEVICE__ float sin(float);
+__DEVICE__ double sinh(double);
+__DEVICE__ float sinh(float);
+__DEVICE__ double sqrt(double);
+__DEVICE__ float sqrt(float);
+__DEVICE__ double tan(double);
+__DEVICE__ float tan(float);
+__DEVICE__ double tanh(double);
+__DEVICE__ float tanh(float);
+__DEVICE__ double tgamma(double);
+__DEVICE__ float tgamma(float);
+__DEVICE__ double trunc(double);
+__DEVICE__ float trunc(float);
+
+namespace std {
+using ::abs;
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isinf;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnan;
+using ::isnormal;
+using ::isunordered;
+using ::labs;
+using ::ldexp;
+using ::lgamma;
+using ::llabs;
+using ::llrint;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::modf;
+using ::nan;
+using ::nanf;
+using ::nearbyint;
+using ::nextafter;
+using ::nexttoward;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+} // namespace std
+
+#pragma pop_macro("__DEVICE__")
+
+#endif
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_runtime_wrapper.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_runtime_wrapper.h
index 95d1f5f..4ad240f 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_runtime_wrapper.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/__clang_cuda_runtime_wrapper.h
@@ -42,6 +42,9 @@
 
 #if defined(__CUDA__) && defined(__clang__)
 
+// Include some forward declares that must come before cmath.
+#include <__clang_cuda_math_forward_declares.h>
+
 // Include some standard headers to avoid CUDA headers including them
 // while some required macros (like __THROW) are in a weird state.
 #include <cmath>
@@ -186,8 +189,21 @@
 // we have to include it and it will in turn include .hpp
 #include "sm_30_intrinsics.h"
 #include "sm_32_intrinsics.hpp"
+
 #undef __MATH_FUNCTIONS_HPP__
+
+// math_functions.hpp defines ::signbit as a __host__ __device__ function.  This
+// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
+// math_functions.hpp's ::signbit.  It's guarded by #undef signbit, but that's
+// conditional on __GNUC__.  :)
+#pragma push_macro("signbit")
+#pragma push_macro("__GNUC__")
+#undef __GNUC__
+#define signbit __ignored_cuda_signbit
 #include "math_functions.hpp"
+#pragma pop_macro("__GNUC__")
+#pragma pop_macro("signbit")
+
 #pragma pop_macro("__host__")
 
 #include "texture_indirect_functions.h"
@@ -200,16 +216,6 @@
 #undef __CUDABE__
 #define __CUDACC__
 
-#if defined(__CUDA_ARCH__)
-// We need to emit IR declaration for non-existing __nvvm_reflect() to
-// let backend know that it should be treated as const nothrow
-// function which is what NVVMReflect pass expects to see.
-extern "C" __device__ __attribute__((const)) int __nvvm_reflect(const void *);
-static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
-  return __nvvm_reflect("NONE");
-}
-#endif
-
 extern "C" {
 // Device-side CUDA system calls.
 // http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/altivec.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/altivec.h
index 0508765..77bc928 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/altivec.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/altivec.h
@@ -10268,21 +10268,41 @@
 
 #ifdef __VSX__
 
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
 static __inline__ vector signed int __ATTRS_o_ai
 vec_vsx_ld(int __a, const vector signed int *__b) {
   return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
 }
 
+static __inline__ vector signed int __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed int *__b) {
+  return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_vsx_ld(int __a, const vector unsigned int *__b) {
   return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
 }
 
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
 static __inline__ vector float __ATTRS_o_ai
 vec_vsx_ld(int __a, const vector float *__b) {
   return (vector float)__builtin_vsx_lxvw4x(__a, __b);
 }
 
+static __inline__ vector float __ATTRS_o_ai vec_vsx_ld(int __a,
+                                                       const float *__b) {
+  return (vector float)__builtin_vsx_lxvw4x(__a, __b);
+}
+
 static __inline__ vector signed long long __ATTRS_o_ai
 vec_vsx_ld(int __a, const vector signed long long *__b) {
   return (vector signed long long)__builtin_vsx_lxvd2x(__a, __b);
@@ -10298,6 +10318,16 @@
   return (vector double)__builtin_vsx_lxvd2x(__a, __b);
 }
 
+static __inline__ vector double __ATTRS_o_ai
+vec_vsx_ld(int __a, const double *__b) {
+  return (vector double)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
 static __inline__ vector signed short __ATTRS_o_ai
 vec_vsx_ld(int __a, const vector signed short *__b) {
   return (vector signed short)__builtin_vsx_lxvw4x(__a, __b);
@@ -10349,6 +10379,21 @@
 
 #ifdef __VSX__
 
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               vector bool int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
 static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
                                                vector signed int *__c) {
   __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
@@ -10374,6 +10419,11 @@
   __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
 }
 
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
+                                               float *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
 static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed long long __a,
                                                int __b,
                                                vector signed long long *__c) {
@@ -10391,6 +10441,25 @@
   __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
 }
 
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
+                                               double *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               vector bool short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
 static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed short __a, int __b,
                                                vector signed short *__c) {
   __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
@@ -10412,6 +10481,21 @@
   __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
 }
 
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               vector bool char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
 static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed char __a, int __b,
                                                vector signed char *__c) {
   __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
@@ -10433,21 +10517,6 @@
   __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
 }
 
-static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
-                                               vector bool char *__c) {
-  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
-}
-
-static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
-                                               signed char *__c) {
-  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
-}
-
-static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
-                                               unsigned char *__c) {
-  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
-}
-
 #endif
 
 /* vec_xor */
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/ammintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/ammintrin.h
index 4880fd7..8985bb4 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/ammintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/ammintrin.h
@@ -38,9 +38,7 @@
 /// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
 /// \endcode
 ///
-/// \code
 /// This intrinsic corresponds to the \c EXTRQ instruction.
-/// \endcode
 ///
 /// \param x
 ///    The value from which bits are extracted.
@@ -49,10 +47,10 @@
 ///    are zero, the length is interpreted as 64.
 /// \param idx
 ///    Bits [5:0] specify the index of the least significant bit; the other
-///    bits are ignored. If the sum of the index and length is greater than
-///    64, the result is undefined. If the length and index are both zero,
-///    bits [63:0] of parameter x are extracted. If the length is zero
-///    but the index is non-zero, the result is undefined.
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter x are extracted. If the length is zero but the index
+///    is non-zero, the result is undefined.
 /// \returns A 128-bit integer vector whose lower 64 bits contain the bits
 ///    extracted from the source operand.
 #define _mm_extracti_si64(x, len, idx) \
@@ -64,20 +62,17 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code
 /// This intrinsic corresponds to the \c EXTRQ instruction.
-/// \endcode
 ///
 /// \param __x
 ///    The value from which bits are extracted.
 /// \param __y
-///    Specifies the index of the least significant bit at [13:8]
-///    and the length at [5:0]; all other bits are ignored.
-///    If bits [5:0] are zero, the length is interpreted as 64.
-///    If the sum of the index and length is greater than 64, the result is
-///    undefined. If the length and index are both zero, bits [63:0] of
-///    parameter __x are extracted. If the length is zero but the index is
-///    non-zero, the result is undefined.
+///    Specifies the index of the least significant bit at [13:8] and the
+///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
+///    length is interpreted as 64. If the sum of the index and length is
+///    greater than 64, the result is undefined. If the length and index are
+///    both zero, bits [63:0] of parameter __x are extracted. If the length is
+///    zero but the index is non-zero, the result is undefined.
 /// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
 ///    from the source operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -86,9 +81,9 @@
   return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
 }
 
-/// \brief Inserts bits of a specified length from the source integer vector
-///    y into the lower 64 bits of the destination integer vector x at the
-///    index idx and of the length len.
+/// \brief Inserts bits of a specified length from the source integer vector y
+///    into the lower 64 bits of the destination integer vector x at the index
+///    idx and of the length len.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -97,9 +92,7 @@
 /// const int idx);
 /// \endcode
 ///
-/// \code
 /// This intrinsic corresponds to the \c INSERTQ instruction.
-/// \endcode
 ///
 /// \param x
 ///    The destination operand where bits will be inserted. The inserted bits
@@ -113,14 +106,14 @@
 ///    are zero, the length is interpreted as 64.
 /// \param idx
 ///    Bits [5:0] specify the index of the least significant bit; the other
-///    bits are ignored. If the sum of the index and length is greater than
-///    64, the result is undefined. If the length and index are both zero,
-///    bits [63:0] of parameter y are inserted into parameter x. If the
-///    length is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits
-///    of destination operand x with the specified bitfields replaced by the
-///    lower bits of source operand y. The upper 64 bits of the return value
-///    are undefined.
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter y are inserted into parameter x. If the length is
+///    zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand x with the specified bitfields replaced by the lower
+///    bits of source operand y. The upper 64 bits of the return value are
+///    undefined.
 
 #define _mm_inserti_si64(x, y, len, idx) \
   ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
@@ -128,14 +121,12 @@
                                     (char)(len), (char)(idx)))
 
 /// \brief Inserts bits of a specified length from the source integer vector
-///    __y into the lower 64 bits of the destination integer vector __x at
-///    the index and of the length specified by __y.
+///    __y into the lower 64 bits of the destination integer vector __x at the
+///    index and of the length specified by __y.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code
 /// This intrinsic corresponds to the \c INSERTQ instruction.
-/// \endcode
 ///
 /// \param __x
 ///    The destination operand where bits will be inserted. The inserted bits
@@ -145,14 +136,14 @@
 ///    The source operand containing the bits to be extracted. The extracted
 ///    bits are the least significant bits of operand __y with length specified
 ///    by bits [69:64]. These are inserted into the destination at the index
-///    specified by bits [77:72]; all other bits are ignored.
-///    If bits [69:64] are zero, the length is interpreted as 64.
-///    If the sum of the index and length is greater than 64, the result is
-///    undefined. If the length and index are both zero, bits [63:0] of
-///    parameter __y are inserted into parameter __x. If the length
-///    is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits
-///    of destination operand __x with the specified bitfields replaced by the
+///    specified by bits [77:72]; all other bits are ignored. If bits [69:64]
+///    are zero, the length is interpreted as 64. If the sum of the index and
+///    length is greater than 64, the result is undefined. If the length and
+///    index are both zero, bits [63:0] of parameter __y are inserted into
+///    parameter __x. If the length is zero but the index is non-zero, the
+///    result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand __x with the specified bitfields replaced by the
 ///    lower bits of source operand __y. The upper 64 bits of the return value
 ///    are undefined.
 
@@ -168,15 +159,12 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code
 /// This intrinsic corresponds to the \c MOVNTSD instruction.
-/// \endcode
 ///
 /// \param __p
 ///    The 64-bit memory location used to store the register value.
 /// \param __a
-///    The 64-bit double-precision floating-point register value to
-///    be stored.
+///    The 64-bit double-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_sd(double *__p, __m128d __a)
 {
@@ -189,15 +177,12 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code
 /// This intrinsic corresponds to the \c MOVNTSS instruction.
-/// \endcode
 ///
 /// \param __p
 ///    The 32-bit memory location used to store the register value.
 /// \param __a
-///    The 32-bit single-precision floating-point register value to
-///    be stored.
+///    The 32-bit single-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_ss(float *__p, __m128 __a)
 {
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/arm_neon.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/arm_neon.h
index 6b424d3..9a2eee4 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/arm_neon.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/arm_neon.h
@@ -4360,27 +4360,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai float16x4_t vcvt_f16_f32(float32x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai float16x4_t vcvt_f16_f32(float32x4_t __p0) {
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai float16x4_t __noswap_vcvt_f16_f32(float32x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 8);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 __ai float32x4_t vcvtq_f32_u32(uint32x4_t __p0) {
   float32x4_t __ret;
   __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__p0, 50);
@@ -4445,27 +4424,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai float32x4_t vcvt_f32_f16(float16x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai float32x4_t vcvt_f32_f16(float16x4_t __p0) {
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai float32x4_t __noswap_vcvt_f32_f16(float16x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vcvtq_n_f32_u32(__p0, __p1) __extension__ ({ \
   uint32x4_t __s0 = __p0; \
   float32x4_t __ret; \
@@ -33576,6 +33534,50 @@
 #endif
 
 #endif
+#if (__ARM_FP & 2)
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vcvt_f16_f32(float32x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vcvt_f16_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x4_t __noswap_vcvt_f16_f32(float32x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 8);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvt_f32_f16(float16x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vcvt_f32_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float32x4_t __noswap_vcvt_f32_f16(float16x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 41);
+  return __ret;
+}
+#endif
+
+#endif
 #if __ARM_ARCH >= 8
 #ifdef __LITTLE_ENDIAN__
 __ai int32x4_t vcvtaq_s32_f32(float32x4_t __p0) {
@@ -40425,7 +40427,7 @@
 #ifdef __LITTLE_ENDIAN__
 __ai float32x4_t vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
   float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmsq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
+  __ret = vfmaq_f32(__p0, -__p1, __p2);
   return __ret;
 }
 #else
@@ -40434,21 +40436,16 @@
   float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
   float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
   float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
+  __ret = __noswap_vfmaq_f32(__rev0, -__rev1, __rev2);
   __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
   return __ret;
 }
-__ai float32x4_t __noswap_vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmsq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
 #endif
 
 #ifdef __LITTLE_ENDIAN__
 __ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
   float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfms_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
+  __ret = vfma_f32(__p0, -__p1, __p2);
   return __ret;
 }
 #else
@@ -40457,15 +40454,10 @@
   float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
   float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
   float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfms_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
+  __ret = __noswap_vfma_f32(__rev0, -__rev1, __rev2);
   __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
   return __ret;
 }
-__ai float32x2_t __noswap_vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfms_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
 #endif
 
 #endif
@@ -47290,6 +47282,11 @@
   __ret = (float64x1_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
   return __ret;
 }
+__ai float64x1_t __noswap_vfma_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
 #endif
 
 #ifdef __LITTLE_ENDIAN__
@@ -47709,7 +47706,7 @@
 #ifdef __LITTLE_ENDIAN__
 __ai float64x2_t vfmsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
   float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vfmsq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
+  __ret = vfmaq_f64(__p0, -__p1, __p2);
   return __ret;
 }
 #else
@@ -47718,27 +47715,22 @@
   float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
   float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
   float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vfmsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
+  __ret = __noswap_vfmaq_f64(__rev0, -__rev1, __rev2);
   __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
   return __ret;
 }
-__ai float64x2_t __noswap_vfmsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vfmsq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
 #endif
 
 #ifdef __LITTLE_ENDIAN__
 __ai float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
   float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vfms_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  __ret = vfma_f64(__p0, -__p1, __p2);
   return __ret;
 }
 #else
 __ai float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
   float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vfms_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  __ret = __noswap_vfma_f64(__p0, -__p1, __p2);
   return __ret;
 }
 #endif
@@ -47749,7 +47741,7 @@
   float64_t __s1_88 = __p1_88; \
   float64x1_t __s2_88 = __p2_88; \
   float64_t __ret_88; \
-  __ret_88 = vfmad_lane_f64(__s0_88, __s1_88, -__s2_88, __p3_88); \
+  __ret_88 = vfmad_lane_f64(__s0_88, -__s1_88, __s2_88, __p3_88); \
   __ret_88; \
 })
 #else
@@ -47758,7 +47750,7 @@
   float64_t __s1_89 = __p1_89; \
   float64x1_t __s2_89 = __p2_89; \
   float64_t __ret_89; \
-  __ret_89 = __noswap_vfmad_lane_f64(__s0_89, __s1_89, -__s2_89, __p3_89); \
+  __ret_89 = __noswap_vfmad_lane_f64(__s0_89, -__s1_89, __s2_89, __p3_89); \
   __ret_89; \
 })
 #endif
@@ -47769,7 +47761,7 @@
   float32_t __s1_90 = __p1_90; \
   float32x2_t __s2_90 = __p2_90; \
   float32_t __ret_90; \
-  __ret_90 = vfmas_lane_f32(__s0_90, __s1_90, -__s2_90, __p3_90); \
+  __ret_90 = vfmas_lane_f32(__s0_90, -__s1_90, __s2_90, __p3_90); \
   __ret_90; \
 })
 #else
@@ -47779,7 +47771,7 @@
   float32x2_t __s2_91 = __p2_91; \
   float32x2_t __rev2_91;  __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 1, 0); \
   float32_t __ret_91; \
-  __ret_91 = __noswap_vfmas_lane_f32(__s0_91, __s1_91, -__rev2_91, __p3_91); \
+  __ret_91 = __noswap_vfmas_lane_f32(__s0_91, -__s1_91, __rev2_91, __p3_91); \
   __ret_91; \
 })
 #endif
@@ -47790,7 +47782,7 @@
   float64x2_t __s1_92 = __p1_92; \
   float64x1_t __s2_92 = __p2_92; \
   float64x2_t __ret_92; \
-  __ret_92 = vfmaq_lane_f64(__s0_92, __s1_92, -__s2_92, __p3_92); \
+  __ret_92 = vfmaq_lane_f64(__s0_92, -__s1_92, __s2_92, __p3_92); \
   __ret_92; \
 })
 #else
@@ -47801,7 +47793,7 @@
   float64x2_t __rev0_93;  __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 1, 0); \
   float64x2_t __rev1_93;  __rev1_93 = __builtin_shufflevector(__s1_93, __s1_93, 1, 0); \
   float64x2_t __ret_93; \
-  __ret_93 = __noswap_vfmaq_lane_f64(__rev0_93, __rev1_93, -__s2_93, __p3_93); \
+  __ret_93 = __noswap_vfmaq_lane_f64(__rev0_93, -__rev1_93, __s2_93, __p3_93); \
   __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 1, 0); \
   __ret_93; \
 })
@@ -47813,7 +47805,7 @@
   float32x4_t __s1_94 = __p1_94; \
   float32x2_t __s2_94 = __p2_94; \
   float32x4_t __ret_94; \
-  __ret_94 = vfmaq_lane_f32(__s0_94, __s1_94, -__s2_94, __p3_94); \
+  __ret_94 = vfmaq_lane_f32(__s0_94, -__s1_94, __s2_94, __p3_94); \
   __ret_94; \
 })
 #else
@@ -47825,7 +47817,7 @@
   float32x4_t __rev1_95;  __rev1_95 = __builtin_shufflevector(__s1_95, __s1_95, 3, 2, 1, 0); \
   float32x2_t __rev2_95;  __rev2_95 = __builtin_shufflevector(__s2_95, __s2_95, 1, 0); \
   float32x4_t __ret_95; \
-  __ret_95 = __noswap_vfmaq_lane_f32(__rev0_95, __rev1_95, -__rev2_95, __p3_95); \
+  __ret_95 = __noswap_vfmaq_lane_f32(__rev0_95, -__rev1_95, __rev2_95, __p3_95); \
   __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 3, 2, 1, 0); \
   __ret_95; \
 })
@@ -47837,7 +47829,7 @@
   float64x1_t __s1_96 = __p1_96; \
   float64x1_t __s2_96 = __p2_96; \
   float64x1_t __ret_96; \
-  __ret_96 = vfma_lane_f64(__s0_96, __s1_96, -__s2_96, __p3_96); \
+  __ret_96 = vfma_lane_f64(__s0_96, -__s1_96, __s2_96, __p3_96); \
   __ret_96; \
 })
 #else
@@ -47846,7 +47838,7 @@
   float64x1_t __s1_97 = __p1_97; \
   float64x1_t __s2_97 = __p2_97; \
   float64x1_t __ret_97; \
-  __ret_97 = __noswap_vfma_lane_f64(__s0_97, __s1_97, -__s2_97, __p3_97); \
+  __ret_97 = __noswap_vfma_lane_f64(__s0_97, -__s1_97, __s2_97, __p3_97); \
   __ret_97; \
 })
 #endif
@@ -47857,7 +47849,7 @@
   float32x2_t __s1_98 = __p1_98; \
   float32x2_t __s2_98 = __p2_98; \
   float32x2_t __ret_98; \
-  __ret_98 = vfma_lane_f32(__s0_98, __s1_98, -__s2_98, __p3_98); \
+  __ret_98 = vfma_lane_f32(__s0_98, -__s1_98, __s2_98, __p3_98); \
   __ret_98; \
 })
 #else
@@ -47869,7 +47861,7 @@
   float32x2_t __rev1_99;  __rev1_99 = __builtin_shufflevector(__s1_99, __s1_99, 1, 0); \
   float32x2_t __rev2_99;  __rev2_99 = __builtin_shufflevector(__s2_99, __s2_99, 1, 0); \
   float32x2_t __ret_99; \
-  __ret_99 = __noswap_vfma_lane_f32(__rev0_99, __rev1_99, -__rev2_99, __p3_99); \
+  __ret_99 = __noswap_vfma_lane_f32(__rev0_99, -__rev1_99, __rev2_99, __p3_99); \
   __ret_99 = __builtin_shufflevector(__ret_99, __ret_99, 1, 0); \
   __ret_99; \
 })
@@ -47881,7 +47873,7 @@
   float64_t __s1_100 = __p1_100; \
   float64x2_t __s2_100 = __p2_100; \
   float64_t __ret_100; \
-  __ret_100 = vfmad_laneq_f64(__s0_100, __s1_100, -__s2_100, __p3_100); \
+  __ret_100 = vfmad_laneq_f64(__s0_100, -__s1_100, __s2_100, __p3_100); \
   __ret_100; \
 })
 #else
@@ -47891,7 +47883,7 @@
   float64x2_t __s2_101 = __p2_101; \
   float64x2_t __rev2_101;  __rev2_101 = __builtin_shufflevector(__s2_101, __s2_101, 1, 0); \
   float64_t __ret_101; \
-  __ret_101 = __noswap_vfmad_laneq_f64(__s0_101, __s1_101, -__rev2_101, __p3_101); \
+  __ret_101 = __noswap_vfmad_laneq_f64(__s0_101, -__s1_101, __rev2_101, __p3_101); \
   __ret_101; \
 })
 #endif
@@ -47902,7 +47894,7 @@
   float32_t __s1_102 = __p1_102; \
   float32x4_t __s2_102 = __p2_102; \
   float32_t __ret_102; \
-  __ret_102 = vfmas_laneq_f32(__s0_102, __s1_102, -__s2_102, __p3_102); \
+  __ret_102 = vfmas_laneq_f32(__s0_102, -__s1_102, __s2_102, __p3_102); \
   __ret_102; \
 })
 #else
@@ -47912,7 +47904,7 @@
   float32x4_t __s2_103 = __p2_103; \
   float32x4_t __rev2_103;  __rev2_103 = __builtin_shufflevector(__s2_103, __s2_103, 3, 2, 1, 0); \
   float32_t __ret_103; \
-  __ret_103 = __noswap_vfmas_laneq_f32(__s0_103, __s1_103, -__rev2_103, __p3_103); \
+  __ret_103 = __noswap_vfmas_laneq_f32(__s0_103, -__s1_103, __rev2_103, __p3_103); \
   __ret_103; \
 })
 #endif
@@ -47923,7 +47915,7 @@
   float64x2_t __s1_104 = __p1_104; \
   float64x2_t __s2_104 = __p2_104; \
   float64x2_t __ret_104; \
-  __ret_104 = vfmaq_laneq_f64(__s0_104, __s1_104, -__s2_104, __p3_104); \
+  __ret_104 = vfmaq_laneq_f64(__s0_104, -__s1_104, __s2_104, __p3_104); \
   __ret_104; \
 })
 #else
@@ -47935,7 +47927,7 @@
   float64x2_t __rev1_105;  __rev1_105 = __builtin_shufflevector(__s1_105, __s1_105, 1, 0); \
   float64x2_t __rev2_105;  __rev2_105 = __builtin_shufflevector(__s2_105, __s2_105, 1, 0); \
   float64x2_t __ret_105; \
-  __ret_105 = __noswap_vfmaq_laneq_f64(__rev0_105, __rev1_105, -__rev2_105, __p3_105); \
+  __ret_105 = __noswap_vfmaq_laneq_f64(__rev0_105, -__rev1_105, __rev2_105, __p3_105); \
   __ret_105 = __builtin_shufflevector(__ret_105, __ret_105, 1, 0); \
   __ret_105; \
 })
@@ -47947,7 +47939,7 @@
   float32x4_t __s1_106 = __p1_106; \
   float32x4_t __s2_106 = __p2_106; \
   float32x4_t __ret_106; \
-  __ret_106 = vfmaq_laneq_f32(__s0_106, __s1_106, -__s2_106, __p3_106); \
+  __ret_106 = vfmaq_laneq_f32(__s0_106, -__s1_106, __s2_106, __p3_106); \
   __ret_106; \
 })
 #else
@@ -47959,7 +47951,7 @@
   float32x4_t __rev1_107;  __rev1_107 = __builtin_shufflevector(__s1_107, __s1_107, 3, 2, 1, 0); \
   float32x4_t __rev2_107;  __rev2_107 = __builtin_shufflevector(__s2_107, __s2_107, 3, 2, 1, 0); \
   float32x4_t __ret_107; \
-  __ret_107 = __noswap_vfmaq_laneq_f32(__rev0_107, __rev1_107, -__rev2_107, __p3_107); \
+  __ret_107 = __noswap_vfmaq_laneq_f32(__rev0_107, -__rev1_107, __rev2_107, __p3_107); \
   __ret_107 = __builtin_shufflevector(__ret_107, __ret_107, 3, 2, 1, 0); \
   __ret_107; \
 })
@@ -47971,7 +47963,7 @@
   float64x1_t __s1_108 = __p1_108; \
   float64x2_t __s2_108 = __p2_108; \
   float64x1_t __ret_108; \
-  __ret_108 = vfma_laneq_f64(__s0_108, __s1_108, -__s2_108, __p3_108); \
+  __ret_108 = vfma_laneq_f64(__s0_108, -__s1_108, __s2_108, __p3_108); \
   __ret_108; \
 })
 #else
@@ -47981,7 +47973,7 @@
   float64x2_t __s2_109 = __p2_109; \
   float64x2_t __rev2_109;  __rev2_109 = __builtin_shufflevector(__s2_109, __s2_109, 1, 0); \
   float64x1_t __ret_109; \
-  __ret_109 = __noswap_vfma_laneq_f64(__s0_109, __s1_109, -__rev2_109, __p3_109); \
+  __ret_109 = __noswap_vfma_laneq_f64(__s0_109, -__s1_109, __rev2_109, __p3_109); \
   __ret_109; \
 })
 #endif
@@ -47992,7 +47984,7 @@
   float32x2_t __s1_110 = __p1_110; \
   float32x4_t __s2_110 = __p2_110; \
   float32x2_t __ret_110; \
-  __ret_110 = vfma_laneq_f32(__s0_110, __s1_110, -__s2_110, __p3_110); \
+  __ret_110 = vfma_laneq_f32(__s0_110, -__s1_110, __s2_110, __p3_110); \
   __ret_110; \
 })
 #else
@@ -48004,7 +47996,7 @@
   float32x2_t __rev1_111;  __rev1_111 = __builtin_shufflevector(__s1_111, __s1_111, 1, 0); \
   float32x4_t __rev2_111;  __rev2_111 = __builtin_shufflevector(__s2_111, __s2_111, 3, 2, 1, 0); \
   float32x2_t __ret_111; \
-  __ret_111 = __noswap_vfma_laneq_f32(__rev0_111, __rev1_111, -__rev2_111, __p3_111); \
+  __ret_111 = __noswap_vfma_laneq_f32(__rev0_111, -__rev1_111, __rev2_111, __p3_111); \
   __ret_111 = __builtin_shufflevector(__ret_111, __ret_111, 1, 0); \
   __ret_111; \
 })
@@ -48013,7 +48005,7 @@
 #ifdef __LITTLE_ENDIAN__
 __ai float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
   float64x2_t __ret;
-  __ret = vfmsq_f64(__p0, __p1, (float64x2_t) {__p2, __p2});
+  __ret = vfmaq_f64(__p0, -__p1, (float64x2_t) {__p2, __p2});
   return __ret;
 }
 #else
@@ -48021,7 +48013,7 @@
   float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
   float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
   float64x2_t __ret;
-  __ret = __noswap_vfmsq_f64(__rev0, __rev1, (float64x2_t) {__p2, __p2});
+  __ret = __noswap_vfmaq_f64(__rev0, -__rev1, (float64x2_t) {__p2, __p2});
   __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
   return __ret;
 }
@@ -48030,7 +48022,7 @@
 #ifdef __LITTLE_ENDIAN__
 __ai float32x4_t vfmsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
   float32x4_t __ret;
-  __ret = vfmsq_f32(__p0, __p1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  __ret = vfmaq_f32(__p0, -__p1, (float32x4_t) {__p2, __p2, __p2, __p2});
   return __ret;
 }
 #else
@@ -48038,7 +48030,7 @@
   float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
   float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
   float32x4_t __ret;
-  __ret = __noswap_vfmsq_f32(__rev0, __rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __noswap_vfmaq_f32(__rev0, -__rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
   __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
   return __ret;
 }
@@ -48047,7 +48039,7 @@
 #ifdef __LITTLE_ENDIAN__
 __ai float32x2_t vfms_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
   float32x2_t __ret;
-  __ret = vfms_f32(__p0, __p1, (float32x2_t) {__p2, __p2});
+  __ret = vfma_f32(__p0, -__p1, (float32x2_t) {__p2, __p2});
   return __ret;
 }
 #else
@@ -48055,7 +48047,7 @@
   float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
   float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
   float32x2_t __ret;
-  __ret = __noswap_vfms_f32(__rev0, __rev1, (float32x2_t) {__p2, __p2});
+  __ret = __noswap_vfma_f32(__rev0, -__rev1, (float32x2_t) {__p2, __p2});
   __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
   return __ret;
 }
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512bwintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512bwintrin.h
index f641c71..11a867a 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512bwintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512bwintrin.h
@@ -1981,6 +1981,255 @@
                  _mm512_setzero_hi (),
                  (__mmask64) __U);
 }
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
+{
+  __builtin_ia32_storedquhi512_mask ((__v32hi *) __P,
+             (__v32hi) __A,
+             (__mmask32) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
+{
+  __builtin_ia32_storedquqi512_mask ((__v64qi *) __P,
+             (__v64qi) __A,
+             (__mmask64) __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_test_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+            (__v64qi) __B,
+            (__mmask64) -1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+            (__v64qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_test_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+            (__v32hi) __B,
+            (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+            (__v32hi) __B, __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+             (__v64qi) __B,
+             (__mmask64) -1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+             (__v64qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+             (__v32hi) __B,
+             (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+             (__v32hi) __B, __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_movepi8_mask (__m512i __A)
+{
+  return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_movepi16_mask (__m512i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi8 (__mmask64 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi16 (__mmask32 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastb_epi8 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
+                   (__v64qi) _mm512_setzero_si512(),
+                   (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
+                   (__v64qi) __O,
+                   __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
+                   (__v64qi) _mm512_setzero_qi(),
+                   __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                 (__v32hi) __O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                 (__v32hi) _mm512_setzero_hi(),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastw_epi16 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
+                   (__v32hi) _mm512_setzero_si512(),
+                   (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
+                   (__v32hi) __O,
+                   __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
+                   (__v32hi) _mm512_setzero_hi(),
+                   __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) _mm512_undefined_epi32 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) __W,
+                 (__mmask32) __M);
+}
+
+#define _mm512_alignr_epi8( __A, __B, __N) __extension__ ({\
+__builtin_ia32_palignr512_mask ((__v8di) __A,\
+                 (__v8di) __B, __N * 8,\
+                 (__v8di) _mm512_undefined_epi32 (),\
+                 (__mmask64) -1);\
+})
+
+#define _mm512_mask_alignr_epi8( __W, __U, __A, __B, __N) __extension__({\
+__builtin_ia32_palignr512_mask ((__v8di) __A,\
+                 (__v8di) __B,\
+                 __N * 8,\
+                 (__v8di) __W,\
+                 (__mmask64) __U);\
+})
+
+#define _mm512_maskz_alignr_epi8( __U, __A, __B, __N) __extension__({\
+__builtin_ia32_palignr512_mask ((__v8di) __A,\
+                 (__v8di) __B,\
+                 __N * 8,\
+                 (__v8di) _mm512_setzero_si512 (),\
+                 (__mmask64) __U);\
+})
+
+#define _mm512_dbsad_epu8( __A,  __B, __imm) __extension__ ({\
+__builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,\
+                                (__v64qi) __B,\
+                                __imm,\
+                                (__v32hi) _mm512_undefined_epi32(),\
+                                (__mmask32) -1);\
+})
+
+#define _mm512_mask_dbsad_epu8( __W, __U, __A, __B, __imm) ({\
+__builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,\
+                                (__v64qi) __B,\
+                                __imm,\
+                                (__v32hi) __W,\
+                                (__mmask32) __U);\
+})
+
+#define _mm512_maskz_dbsad_epu8( __U, __A, __B, __imm) ({\
+__builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,\
+                                (__v64qi) __B,\
+                                __imm,\
+                                (__v32hi) _mm512_setzero_hi(),\
+                                (__mmask32) __U);\
+})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sad_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
+               (__v64qi) __B);
+}
+
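+/* Illustrative sketch (hypothetical helper; assumes the AVX-512F stores are
+   visible via <immintrin.h>): vpsadbw sums |__A[i] - __B[i]| over each group
+   of eight bytes into the matching 64-bit lane, so adding the eight lanes
+   gives the L1 distance of two 64-byte blocks. */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__example_l1_distance_64 (__m512i __A, __m512i __B)
+{
+  unsigned long long __q[8];
+  _mm512_storeu_si512 ((void *) __q, _mm512_sad_epu8 (__A, __B));
+  return __q[0] + __q[1] + __q[2] + __q[3] + __q[4] + __q[5] + __q[6] + __q[7];
+}
+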
 #undef __DEFAULT_FN_ATTRS
 
 #endif
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512cdintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512cdintrin.h
index 3894b29..23c4235 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512cdintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512cdintrin.h
@@ -126,6 +126,19 @@
              (__v8di) _mm512_setzero_si512 (),
              (__mmask8) __U);
 }
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmb512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmw512 (__A);
+}
+
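+/* The two intrinsics above (vpbroadcastmb2q / vpbroadcastmw2d) copy a mask
+   register into every lane as integer data; e.g. _mm512_broadcastmw_epi32 (__M)
+   produces the same vector as _mm512_set1_epi32 with the zero-extended mask
+   value, without a GPR round trip. */
+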
 #undef __DEFAULT_FN_ATTRS
 
 #endif
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512dqintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512dqintrin.h
index afee490..ab970f1 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512dqintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512dqintrin.h
@@ -773,6 +773,371 @@
   (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
                (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, __R);})
 
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_movepi32_mask (__m512i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_movepi64_mask (__m512i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
+}
+
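+/* _mm512_movepi32_mask / _mm512_movm_epi32 and the 64-bit pair above convert
+   between per-lane sign bits and a mask register (vpmovd2m / vpmovm2d and
+   vpmovq2m / vpmovm2q), mirroring the byte/word forms in avx512bwintrin.h. */
+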
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x2 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)_mm512_undefined_ps(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)
+                __O, __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)_mm512_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x8 (__m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)_mm512_undefined_ps(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)__O,
+                __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)_mm512_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcast_f64x2 (__m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)_mm512_undefined_pd(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)
+                 __O, __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)_mm512_setzero_pd (),
+                 __M);
+}
+
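+/* The x2/x8 broadcasts above repeat a small tuple across the full register:
+   f32x2 repeats the two low floats eight times, f32x8 repeats a whole
+   __m256, and f64x2 repeats a 128-bit pair of doubles four times; the
+   integer i32x2/i32x8/i64x2 forms below do the same for integer lanes. */
+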
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x2 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)_mm512_setzero_si512(),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)
+                 __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)_mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x8 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)_mm512_setzero_si512(),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)__O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i64x2 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)_mm512_setzero_si512(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)
+                 __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)_mm512_setzero_si512 (),
+                 __M);
+}
+
+#define _mm512_extractf32x8_ps( __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf32x8_mask ((__v16sf)( __A),\
+                ( __imm),\
+                (__v8sf) _mm256_setzero_ps (),\
+                (__mmask8) -1);\
+})
+
+#define _mm512_mask_extractf32x8_ps( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf32x8_mask ((__v16sf)( __A),\
+                ( __imm),\
+                (__v8sf)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_extractf32x8_ps( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf32x8_mask ((__v16sf)( __A),\
+                ( __imm),\
+                (__v8sf) _mm256_setzero_ps (),\
+                (__mmask8)( __U));\
+})
+
+#define _mm512_extractf64x2_pd( __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf64x2_512_mask ((__v8df)( __A),\
+               ( __imm),\
+               (__v2df) _mm_setzero_pd (),\
+               (__mmask8) -1);\
+})
+
+#define _mm512_mask_extractf64x2_pd( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf64x2_512_mask ((__v8df)( __A),\
+               ( __imm),\
+               (__v2df)( __W),\
+               (__mmask8) ( __U));\
+})
+
+#define _mm512_maskz_extractf64x2_pd( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf64x2_512_mask ((__v8df)( __A),\
+               ( __imm),\
+               (__v2df) _mm_setzero_pd (),\
+               (__mmask8) ( __U));\
+})
+
+#define _mm512_extracti32x8_epi32( __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti32x8_mask ((__v16si)( __A),\
+                ( __imm),\
+                (__v8si) _mm256_setzero_si256 (),\
+                (__mmask8) -1);\
+})
+
+#define _mm512_mask_extracti32x8_epi32( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti32x8_mask ((__v16si)( __A),\
+                ( __imm),\
+                (__v8si)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_extracti32x8_epi32( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti32x8_mask ((__v16si)( __A),\
+                ( __imm),\
+                (__v8si) _mm256_setzero_si256 (),\
+                (__mmask8)( __U));\
+})
+
+#define _mm512_extracti64x2_epi64( __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti64x2_512_mask ((__v8di)( __A),\
+                ( __imm),\
+                (__v2di) _mm_setzero_di (),\
+                (__mmask8) -1);\
+})
+
+#define _mm512_mask_extracti64x2_epi64( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti64x2_512_mask ((__v8di)( __A),\
+                ( __imm),\
+                (__v2di)( __W),\
+                (__mmask8) ( __U));\
+})
+
+#define _mm512_maskz_extracti64x2_epi64( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti64x2_512_mask ((__v8di)( __A),\
+                ( __imm),\
+                (__v2di) _mm_setzero_di (),\
+                (__mmask8) ( __U));\
+})
+
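+/* Illustrative sketch (hypothetical helper): __imm selects which aligned
+   128-bit chunk of the source comes out, so 3 returns the two highest
+   quadwords of __A. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+__example_high_epi64x2 (__m512i __A)
+{
+  return (__m128i) _mm512_extracti64x2_epi64 (__A, 3);
+}
+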
+#define _mm512_insertf64x2( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf64x2_512_mask ((__v8df)( __A),\
+                (__v2df)( __B),\
+                ( __imm),\
+                (__v8df) _mm512_setzero_pd (),\
+                (__mmask8) -1);\
+})
+
+#define _mm512_mask_insertf64x2( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf64x2_512_mask ((__v8df)( __A),\
+                (__v2df)( __B),\
+                ( __imm),\
+                (__v8df)( __W),\
+                (__mmask8) ( __U));\
+})
+
+#define _mm512_maskz_insertf64x2( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf64x2_512_mask ((__v8df)( __A),\
+                (__v2df)( __B),\
+                ( __imm),\
+                (__v8df) _mm512_setzero_pd (),\
+                (__mmask8) ( __U));\
+})
+
+#define _mm512_inserti32x8( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti32x8_mask ((__v16si)( __A),\
+                (__v8si)( __B),\
+                ( __imm),\
+                (__v16si) _mm512_setzero_si512 (),\
+                (__mmask16) -1);\
+})
+
+#define _mm512_mask_inserti32x8( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti32x8_mask ((__v16si)( __A),\
+                (__v8si)( __B),\
+                ( __imm),\
+                (__v16si)( __W),\
+                (__mmask16)( __U));\
+})
+
+#define _mm512_maskz_inserti32x8( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti32x8_mask ((__v16si)( __A),\
+                (__v8si)( __B),\
+                ( __imm),\
+                (__v16si) _mm512_setzero_si512 (),\
+                (__mmask16)( __U));\
+})
+
+#define _mm512_inserti64x2( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti64x2_512_mask ((__v8di)( __A),\
+                (__v2di)( __B),\
+                ( __imm),\
+                (__v8di) _mm512_setzero_si512 (),\
+                (__mmask8) -1);\
+})
+
+#define _mm512_mask_inserti64x2( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti64x2_512_mask ((__v8di)( __A),\
+                (__v2di)( __B),\
+                ( __imm),\
+                (__v8di)( __W),\
+                (__mmask8) ( __U));\
+})
+
+#define _mm512_maskz_inserti64x2( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti64x2_512_mask ((__v8di)( __A),\
+                (__v2di)( __B),\
+                ( __imm),\
+                (__v8di) _mm512_setzero_si512 (),\
+                (__mmask8) ( __U));\
+})
+
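+/* Illustrative sketch (hypothetical helper): the insert forms are the
+   inverse of the extracts above, overwriting one aligned chunk; immediate 0
+   replaces the lowest 128 bits of __A with __B. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+__example_replace_low_epi64x2 (__m512i __A, __m128i __B)
+{
+  return (__m512i) _mm512_inserti64x2 (__A, __B, 0);
+}
+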
+#define _mm512_mask_fpclass_ps_mask( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclassps512_mask ((__v16sf)( __A),\
+                  ( __imm),\
+                  ( __U));\
+})
+
+#define _mm512_fpclass_ps_mask( __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclassps512_mask ((__v16sf)( __A),\
+                   ( __imm),\
+                   (__mmask16) -1);\
+})
+
+#define _mm512_mask_fpclass_pd_mask( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclasspd512_mask ((__v8df)( __A),\
+                  ( __imm),( __U));\
+})
+
+#define _mm512_fpclass_pd_mask( __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclasspd512_mask ((__v8df)( __A),\
+                  ( __imm),\
+                  (__mmask8) -1);\
+})
+
+#define _mm_fpclass_sd_mask( __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclasssd_mask ((__v2df)( __A), ( __imm), (__mmask8) -1);\
+})
+
+#define _mm_mask_fpclass_sd_mask( __U,__A, __imm) __extension__ ({ \
+__builtin_ia32_fpclasssd_mask ((__v2df)( __A), ( __imm), (__mmask8) __U);\
+})
+
+#define _mm_fpclass_ss_mask( __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclassss_mask ((__v4sf)( __A), ( __imm), (__mmask8) -1);\
+})
+
+#define _mm_mask_fpclass_ss_mask(__U ,__A, __imm) __extension__ ({ \
+__builtin_ia32_fpclassss_mask ((__v4sf)( __A), ( __imm), (__mmask8) __U);\
+})
+
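+/* Illustrative sketch (hypothetical helper): the fpclass immediate is a bit
+   set of categories (QNaN 0x01, +0 0x02, -0 0x04, +Inf 0x08, -Inf 0x10,
+   denormal 0x20, negative 0x40, SNaN 0x80), so 0x99 flags every lane that
+   holds a NaN or an infinity. */
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+__example_nonfinite_lanes (__m512 __A)
+{
+  return (__mmask16) _mm512_fpclass_ps_mask (__A, 0x99);
+}
+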
 #undef __DEFAULT_FN_ATTRS
 
 #endif
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512fintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512fintrin.h
index e1f81a0..6500bb5 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512fintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512fintrin.h
@@ -48,6 +48,96 @@
 
 typedef enum
 {
+  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
+  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
+  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
+  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
+  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
+  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
+  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
+  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
+  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
+  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
+  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
+  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
+  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
+  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
+  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
+  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
+  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
+  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
+  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
+  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
+  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
+  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
+  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
+  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
+  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
+  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
+  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
+  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
+  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
+  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
+  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
+  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
+  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
+  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
+  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
+  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
+  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
+  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
+  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
+  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
+  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
+  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
+  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
+  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
+  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
+  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
+  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
+  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
+  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
+  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
+  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
+  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
+  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
+  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
+  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
+  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
+  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
+  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
+  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
+  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
+  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
+  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
+  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
+  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
+  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
+  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
+  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
+  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
+  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
+  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
+  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
+  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
+  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
+  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
+  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
+  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
+  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
+  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
+  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
+  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
+  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
+  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
+  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
+  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
+  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
+  _MM_PERM_DDDD = 0xFF
+} _MM_PERM_ENUM;
+
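+/* Each _MM_PERM name spells, for result positions 3 down to 0, which source
+   element is chosen, with A..D standing for selector values 0..3, two bits
+   apiece: _MM_PERM_DCBA (0xE4) is the identity permutation and
+   _MM_PERM_AAAA (0x00) splats element A (the lowest).  The values are meant
+   for immediate shuffles such as _mm512_shuffle_epi32. */
+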
+typedef enum
+{
   _MM_MANT_NORM_1_2,    /* interval [1, 2)      */
   _MM_MANT_NORM_p5_2,   /* interval [0.5, 2)    */
   _MM_MANT_NORM_p5_1,   /* interval [0.5, 1)    */
@@ -95,6 +185,55 @@
 {
   return (__m512i)__builtin_ia32_undef512();
 }
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastd_epi32 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+              (__v16si)
+              _mm512_undefined_epi32 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+              (__v16si) __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastq_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+              (__v8di)
+              _mm512_undefined_epi32 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+              (__v8di) __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              __M);
+}
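+
+/* vpbroadcastd/vpbroadcastq above splat the low 32- or 64-bit element of a
+   vector source; they complement the _mm512_set1_epi32/_mm512_set1_epi64
+   forms, which broadcast from a general-purpose register. */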
 
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_set1_epi32(__mmask16 __M, int __A)
@@ -204,6 +343,31 @@
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
 }
 
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_castpd128_pd512 (__m128d __A)
+{
+  return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_castps128_ps512 (__m128 __A)
+{
+  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_castsi128_si512 (__m128i __A)
+{
+  return __builtin_shufflevector(__A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_castsi256_si512 (__m256i __A)
+{
+  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
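+/* In the casts above, the -1 shufflevector indices leave the upper lanes
+   unspecified: only the low 128 or 256 bits of the result carry the input,
+   which matches the documented semantics of the _mm512_cast* family. */
+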
 /* Bitwise operators */
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_epi32(__m512i __a, __m512i __b)
@@ -931,6 +1095,24 @@
              (__mmask8) -1);
 }
 
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_ps (),
+          (__mmask8) __U);
+}
+
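+/* For these _ss forms only the low float is approximated (relative error at
+   most 2^-14); bits 127:32 of the result are copied from __A, and __W or
+   zero is merged into the low element according to __U. */
+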
 static  __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_rsqrt14_sd(__m128d __A, __m128d __B)
 {
@@ -941,6 +1123,24 @@
               (__mmask8) -1);
 }
 
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U);
+}
+
 static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_rcp14_pd(__m512d __A)
 {
@@ -968,6 +1168,24 @@
                  (__mmask8) -1);
 }
 
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_ps (),
+          (__mmask8) __U);
+}
+
 static  __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_rcp14_sd(__m128d __A, __m128d __B)
 {
@@ -978,6 +1196,24 @@
             (__mmask8) -1);
 }
 
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U);
+}
+
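+/* Illustrative sketch (hypothetical helper; assumes the SSE2 _mm_mul_sd is
+   visible via <immintrin.h>): rcp14 has relative error at most 2^-14, so an
+   approximate low-lane division is a single multiply by the reciprocal. */
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+__example_fast_div_sd (__m128d __A, __m128d __B)
+{
+  return _mm_mul_sd (__A, _mm_rcp14_sd (__A, __B));   /* low lane: ~__A/__B */
+}
+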
 static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_floor_ps(__m512 __A)
 {
@@ -2339,12 +2575,40 @@
                                          (I), (__v8di)_mm512_setzero_si512(), \
                                          (__mmask8)-1); })
 
+#define _mm512_mask_alignr_epi64( __W,  __U,  __A, __B, __imm) __extension__({\
+  (__m512i)__builtin_ia32_alignq512_mask ((__v8di) __A,\
+                                         (__v8di) __B, __imm,\
+                                         (__v8di) __W,\
+                                         (__mmask8) __U);\
+})
+
+#define _mm512_maskz_alignr_epi64( __U,  __A,  __B, __imm) __extension__({\
+  (__m512i)__builtin_ia32_alignq512_mask ((__v8di) __A,\
+                                         (__v8di) __B, __imm,\
+                                         (__v8di) _mm512_setzero_si512 (),\
+                                         (__mmask8) __U);\
+})
+
 #define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
+    (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
                                          (__v16si)(__m512i)(B), \
                                          (I), (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)-1); })
+                                         (__mmask16)-1);\
+})
+
+#define _mm512_mask_alignr_epi32( __W, __U, __A, __B,  __imm) __extension__ ({\
+    (__m512i) __builtin_ia32_alignd512_mask((__v16si) __A,\
+                                         (__v16si) __B, __imm,\
+                                         (__v16si) __W,\
+                                         (__mmask16) __U);\
+})
 
+#define _mm512_maskz_alignr_epi32( __U, __A, __B, __imm) __extension__({\
+    (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,\
+                                         (__v16si) __B, __imm,\
+                                         (__v16si) _mm512_setzero_si512 (),\
+                                         (__mmask16) __U);\
+})
+
 /* Vector Extract */
 
 #define _mm512_extractf64x4_pd(A, I) __extension__ ({                    \
@@ -3918,6 +4182,42 @@
                  (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_sd( __W, __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_getexpsd128_round_mask ((__v2df) __A,\
+          (__v2df) __B,\
+          (__v2df) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_sd( __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,\
+          (__v2df) __B,\
+          (__v2df) _mm_setzero_pd (),\
+          (__mmask8) __U,\
+          __R);\
+})
+
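+/* getexp returns floor(log2(|b|)) of the low double as a double, with bits
+   127:64 of the result taken from __A and the usual merge/zero behaviour of
+   the mask forms under __U. */
+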
 #define _mm_getexp_round_ss( __A, __B, __R) __extension__ ({ \
 __builtin_ia32_getexpss128_round_mask ((__v4sf)( __A),\
                 (__v4sf)( __B), (__v4sf)  _mm_setzero_ps(), (__mmask8) -1,\
@@ -3931,6 +4231,42 @@
                 (__v4sf) __B, (__v4sf)  _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_ss( __W, __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_getexpss128_round_mask ((__v4sf) __A,\
+          (__v4sf) __B,\
+          (__v4sf) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_ps (),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_ss( __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_getexpss128_round_mask ((__v4sf) __A,\
+          (__v4sf) __B,\
+          (__v4sf) _mm_setzero_ps (),\
+          (__mmask8) __U,\
+          __R);\
+})
+
 #define _mm_getmant_round_sd( __A, __B, __C, __D, __R) __extension__ ({ \
 __builtin_ia32_getmantsd_round_mask ((__v2df)( __A),\
               (__v2df)( __B),\
@@ -3945,6 +4281,42 @@
               _MM_FROUND_CUR_DIRECTION);\
 })
 
+#define _mm_mask_getmant_sd( __W, __U, __A, __B, __C, __D) __extension__ ({\
+__builtin_ia32_getmantsd_round_mask ( (__v2df) __A,\
+          (__v2df) __B,\
+          (( __D) << 2) |( __C),\
+          (__v2df) __W,\
+          (__mmask8) __U,\
+          _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_mask_getmant_round_sd( __W, __U, __A, __B, __C, __D, __R) __extension__ ({\
+__builtin_ia32_getmantsd_round_mask ( (__v2df) __A,\
+          (__v2df) __B,\
+          (( __D) << 2) |( __C),\
+          (__v2df) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+#define _mm_maskz_getmant_sd( __U, __A, __B, __C, __D) __extension__ ({\
+__builtin_ia32_getmantsd_round_mask ( (__v2df) __A,\
+          (__v2df) __B,\
+          (( __D) << 2) |( __C),\
+          (__v2df) _mm_setzero_pd (),\
+          (__mmask8) __U,\
+          _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_maskz_getmant_round_sd( __U, __A, __B, __C, __D, __R) __extension__ ({\
+__builtin_ia32_getmantsd_round_mask ( (__v2df) __A,\
+          (__v2df) __B,\
+          (( __D) << 2) |( __C),\
+          (__v2df) _mm_setzero_pd (),\
+          (__mmask8) __U,\
+          __R);\
+})
+
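+/* getmant normalizes the low-element mantissa into the interval selected by
+   __C (for example [1, 2)) with sign control __D, so together with getexp it
+   decomposes x as mantissa * 2^exponent. */
+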
 #define _mm_getmant_round_ss( __A, __B, __C, __D, __R) __extension__ ({ \
 __builtin_ia32_getmantss_round_mask ((__v4sf)( __A),\
               (__v4sf)( __B),\
@@ -3959,6 +4331,41 @@
               _MM_FROUND_CUR_DIRECTION);\
 })
 
+#define _mm_mask_getmant_ss( __W, __U, __A, __B, __C, __D) __extension__ ({\
+__builtin_ia32_getmantss_round_mask ((__v4sf) __A,\
+          (__v4sf) __B,\
+          (( __D) << 2) |( __C),\
+          (__v4sf) __W,\
+          (__mmask8) __U,\
+          _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_mask_getmant_round_ss( __W, __U, __A, __B, __C, __D, __R) __extension__ ({\
+__builtin_ia32_getmantss_round_mask ((__v4sf) __A,\
+          (__v4sf) __B,\
+          (( __D) << 2) |( __C),\
+          (__v4sf) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+#define _mm_maskz_getmant_ss( __U, __A, __B, __C, __D) __extension__ ({\
+__builtin_ia32_getmantss_round_mask ((__v4sf) __A,\
+          (__v4sf) __B,\
+          (( __D) << 2) |( __C),\
+          (__v4sf) _mm_setzero_ps (),\
+          (__mmask8) __U,\
+          _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_maskz_getmant_round_ss( __U, __A, __B, __C, __D, __R) __extension__ ({\
+__builtin_ia32_getmantss_round_mask ((__v4sf) __A,\
+          (__v4sf) __B,\
+          (( __D) << 2) |( __C),\
+          (__v4sf) _mm_setzero_ps (),\
+          (__mmask8) __U,\
+          __R);\
+})
 
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_kmov (__mmask16 __A)
@@ -3974,6 +4381,3801 @@
 __builtin_ia32_vcomiss ((__v4sf) (__A), (__v4sf) (__B), ( __P), ( __R));\
 })
 
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+#define _mm_cvt_roundsd_si64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtsd2si64 ((__v2df)( __A),( __R));\
+})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
+         __mmask16 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A,
+                   (__v16si) __I
+                   /* idx */ ,
+                   (__v16si) __B,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+                 (__v16si) __B,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+             (__v4si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+             (__v4si) __B,
+             (__v16si) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+             (__v4si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+             (__v2di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+             (__v2di) __B,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+             (__v2di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+             (__v8di) __Y,
+             (__v8di)
+             _mm512_undefined_epi32 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+             (__v8di) __Y,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+             (__v8di) __Y,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+             (__v4si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+             (__v4si) __B,
+             (__v16si) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+             (__v4si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+             (__v2di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+             (__v2di) __B,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+             (__v2di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+             (__v8di) __Y,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+             (__v8di) __Y,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+             (__v8di) __Y,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+             (__v4si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+             (__v4si) __B,
+             (__v16si) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+             (__v4si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+             (__v2di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+             (__v2di) __B,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+             (__v2di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+             (__v8di) __Y,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+             (__v8di) __Y,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+             (__v8di) __Y,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
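+/* In the shift family above, the sll/srl/sra forms shift every lane by the
+   scalar count held in the low 64 bits of __B, while the sllv/srlv/srav
+   forms take an independent count per lane; logical shifts yield zero once
+   the count reaches the lane width, and arithmetic shifts keep filling with
+   the sign bit. */
+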
+#define _mm512_ternarylogic_epi32( __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogd512_mask ((__v16si)( __A),\
+                 (__v16si)( __B),\
+                 (__v16si)( __C),\
+                ( imm), (__mmask16) -1);\
+})
+
+#define _mm512_mask_ternarylogic_epi32( __A, __U, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogd512_mask ((__v16si)( __A),\
+                 (__v16si)( __B),\
+                 (__v16si)( __C),\
+                ( imm), (__mmask16)( __U));\
+})
+
+#define _mm512_maskz_ternarylogic_epi32( __U, __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogd512_maskz ((__v16si)( __A),\
+                  (__v16si)( __B),\
+                  (__v16si)( __C),\
+                 ( imm), (__mmask16)( __U));\
+})
+
+#define _mm512_ternarylogic_epi64( __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogq512_mask ((__v8di)( __A),\
+                 (__v8di)( __B),\
+                 (__v8di)( __C),( imm),\
+                 (__mmask8) -1);\
+})
+
+#define _mm512_mask_ternarylogic_epi64( __A, __U, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogq512_mask ((__v8di)( __A),\
+                 (__v8di)( __B),\
+                 (__v8di)( __C),( imm),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_ternarylogic_epi64( __U, __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogq512_maskz ((__v8di)( __A),\
+                  (__v8di)( __B),\
+                  (__v8di)( __C),\
+                 ( imm), (__mmask8)( __U));\
+})
+
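+/* Illustrative sketch (hypothetical helper): the ternarylogic immediate is an
+   eight-entry truth table indexed per bit by (a<<2)|(b<<1)|c, so 0x96
+   evaluates the three-way XOR a ^ b ^ c in a single vpternlog instruction. */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+__example_xor3_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) _mm512_ternarylogic_epi32 (__A, __B, __C, 0x96);
+}
+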
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_setzero_ps (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_setzero_ps (),
+               (__mmask16) __U);
+}
+
+#define _mm_cvt_roundsd_i64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtsd2si64 ((__v2df)( __A),( __R));\
+})
+
+#define _mm_cvt_roundsd_si32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtsd2si32 ((__v2df)( __A),( __R));\
+})
+
+#define _mm_cvt_roundsd_i32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtsd2si32 ((__v2df)( __A),( __R));\
+})
+
+#define _mm_cvt_roundsd_u32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtsd2usi32 ((__v2df)( __A),( __R));\
+})
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvtsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundsd_u64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtsd2usi64 ((__v2df)( __A),( __R));\
+})
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvtsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
+                 __A,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundss_si32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtss2si32 ((__v4sf)( __A),( __R));\
+})
+
+#define _mm_cvt_roundss_i32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtss2si32 ((__v4sf)( __A),( __R));\
+})
+
+#define _mm_cvt_roundss_si64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtss2si64 ((__v4sf)( __A),( __R));\
+})
+
+#define _mm_cvt_roundss_i64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtss2si64 ((__v4sf)( __A),( __R));\
+})
+
+#define _mm_cvt_roundss_u32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtss2usi32 ((__v4sf)( __A),( __R));\
+})
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvtss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundss_u64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvtss2usi64 ((__v4sf)( __A),( __R));\
+})
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvtss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
+                 __A,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtt_roundsd_i32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttsd2si32 ((__v2df)( __A),( __R));\
+})
+
+#define _mm_cvtt_roundsd_si32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttsd2si32 ((__v2df)( __A),( __R));\
+})
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttsd_i32 (__m128d __A)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtt_roundsd_si64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttsd2si64 ((__v2df)( __A),( __R));\
+})
+
+#define _mm_cvtt_roundsd_i64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttsd2si64 ((__v2df)( __A),( __R));\
+})
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_i64 (__m128d __A)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtt_roundsd_u32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttsd2usi32 ((__v2df)( __A),( __R));\
+})
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvttsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtt_roundsd_u64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttsd2usi64 ((__v2df)( __A),( __R));\
+})
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
+                  __A,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtt_roundss_i32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttss2si32 ((__v4sf)( __A),( __R));\
+})
+
+#define _mm_cvtt_roundss_si32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttss2si32 ((__v4sf)( __A),( __R));\
+})
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttss_i32 (__m128 __A)
+{
+  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtt_roundss_i64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttss2si64 ((__v4sf)( __A),( __R));\
+})
+
+#define _mm_cvtt_roundss_si64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttss2si64 ((__v4sf)( __A),( __R));\
+})
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttss_i64 (__m128 __A)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtt_roundss_u32( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttss2usi32 ((__v4sf)( __A),( __R));\
+})
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvttss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtt_roundss_u64( __A, __R) __extension__ ({ \
+__builtin_ia32_vcvttss2usi64 ((__v4sf)( __A),( __R));\
+})
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvttss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
+                  __A,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
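+/* The cvtt conversions above truncate toward zero regardless of the current
+   rounding mode; the __R argument of the _round variants only admits
+   exception control such as _MM_FROUND_NO_EXC, not a rounding direction. */
+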
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
+            __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A,
+              (__v8di) __I
+              /* idx */ ,
+              (__v8df) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
+            __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A,
+                   (__v16si) __I
+                   /* idx */ ,
+                   (__v16sf) __B,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
+         __mmask8 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A,
+                   (__v8di) __I
+                   /* idx */ ,
+                   (__v8di) __B,
+                   (__mmask8) __U);
+}
+
+#define _mm512_permute_pd( __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilpd512_mask ((__v8df)( __X),( __C),\
+                 (__v8df)\
+                 _mm512_undefined_pd (),\
+                 (__mmask8) -1);\
+})
+
+#define _mm512_mask_permute_pd( __W, __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilpd512_mask ((__v8df)( __X),( __C),\
+                 (__v8df)( __W),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_permute_pd( __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilpd512_mask ((__v8df)( __X),( __C),\
+                 (__v8df)\
+                 _mm512_setzero_pd (),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm512_permute_ps( __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilps512_mask ((__v16sf)( __X),( __C),\
+                (__v16sf)\
+                _mm512_undefined_ps (),\
+                (__mmask16) -1);\
+})
+
+#define _mm512_mask_permute_ps( __W, __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilps512_mask ((__v16sf)( __X),( __C),\
+                (__v16sf)( __W),\
+                (__mmask16)( __U));\
+})
+
+#define _mm512_maskz_permute_ps( __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilps512_mask ((__v16sf)( __X),( __C),\
+                (__v16sf)\
+                _mm512_setzero_ps (),\
+                (__mmask16)( __U));\
+})
+
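+/* Variable in-lane permutes (vpermilvarpd/ps): each element of __A is
+   selected within its 128-bit lane by control bits taken from the matching
+   element of __C. */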
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_permutevar_pd (__m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+              (__v8di) __C,
+              (__v8df)
+              _mm512_undefined_pd (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+              (__v8di) __C,
+              (__v8df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+              (__v8di) __C,
+              (__v8df)
+              _mm512_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_permutevar_ps (__m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+                   (__v16si) __C,
+                   (__v16sf)
+                   _mm512_undefined_ps (),
+                   (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+                   (__v16si) __C,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+                   (__v16si) __C,
+                   (__v16sf)
+                   _mm512_setzero_ps (),
+                   (__mmask16) __U);
+}
+
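+/* Zero-masking two-source permutes (vpermt2d/q/ps/pd): as permutex2var,
+   but destination lanes whose bit in __U is clear are zeroed. */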
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I
+              /* idx */ ,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
+            __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I
+               /* idx */ ,
+               (__v8df) __A,
+               (__v8df) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
+            __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I
+              /* idx */ ,
+              (__v16sf) __A,
+              (__v16sf) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I
+              /* idx */ ,
+              (__v8di) __A,
+              (__v8di) __B,
+              (__mmask8) __U);
+}
+
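+/* vptestnm: each result mask bit is set when the bitwise AND of the
+   corresponding elements of __A and __B is zero. */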
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+             (__v16si) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+             (__v16si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+            (__v8di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+            (__v8di) __B, __U);
+}
+
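+/* Truncating double -> unsigned int conversions.  Truncation fixes the
+   rounding mode, so __R here only matters for exception suppression
+   (_MM_FROUND_NO_EXC). */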
+#define _mm512_cvtt_roundpd_epu32( __A, __R) __extension__ ({ \
+__builtin_ia32_cvttpd2udq512_mask ((__v8df)( __A),\
+                  (__v8si)\
+                  _mm256_undefined_si256 (),\
+                  (__mmask8) -1,( __R));\
+})
+
+#define _mm512_mask_cvtt_roundpd_epu32( __W, __U, __A, __R) __extension__ ({ \
+__builtin_ia32_cvttpd2udq512_mask ((__v8df)( __A),\
+                  (__v8si)( __W),\
+                  (__mmask8)( __U),( __R));\
+})
+
+#define _mm512_maskz_cvtt_roundpd_epu32( __U, __A, __R) __extension__ ({ \
+__builtin_ia32_cvttpd2udq512_mask ((__v8df)( __A),\
+                  (__v8si)\
+                  _mm256_setzero_si256 (),\
+                  (__mmask8)( __U),( __R));\
+})
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_undefined_si256 (),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
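+/* Unpacks interleave the high (punpckh*) or low (punpckl*) halves of each
+   128-bit lane of __A and __B. */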
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+                 (__v16si) __B,
+                 (__v16si) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+                 (__v16si) __B,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+                  (__v8di) __B,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+                  (__v8di) __B,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+                  (__v8di) __B,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+                 (__v16si) __B,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+                 (__v16si) __B,
+                 (__v16si) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+                 (__v16si) __B,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+                  (__v8di) __B,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+                  (__v8di) __B,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+                  (__v8di) __B,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) __U);
+}
+
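+/* Scalar roundscale: rounds __B[0] to 2^-M * round(2^M * x), where M is
+   imm[7:4] and imm[1:0] selects the rounding mode; the upper element of the
+   result is copied from __A. */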
+#define _mm_roundscale_round_sd( __A, __B, __imm, __R) __extension__ ({ \
+__builtin_ia32_rndscalesd_round_mask ((__v2df)( __A),\
+                (__v2df)( __B), (__v2df) _mm_setzero_pd(),\
+                (__mmask8) -1,( __imm),( __R));\
+})
+
+#define _mm_roundscale_sd( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_rndscalesd_round_mask ((__v2df)( __A),\
+                (__v2df)( __B), (__v2df) _mm_setzero_pd(),\
+                (__mmask8) -1, ( __imm),\
+               _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_mask_roundscale_sd( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_rndscalesd_round_mask ( (__v2df)( __A),\
+                (__v2df)( __B),\
+                (__v2df)( __W),\
+                (__mmask8)( __U),\
+                (__imm),\
+                _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_mask_roundscale_round_sd( __W, __U, __A, __B, __I, __R) __extension__ ({ \
+__builtin_ia32_rndscalesd_round_mask ( (__v2df)( __A),\
+                (__v2df)( __B),\
+                (__v2df)( __W),\
+                (__mmask8)( __U),\
+                __I,\
+                __R);\
+})
+
+#define _mm_maskz_roundscale_sd( __U, __A, __B, __I) __extension__ ({ \
+__builtin_ia32_rndscalesd_round_mask ( (__v2df)( __A),\
+                 (__v2df)( __B),\
+                (__v2df) _mm_setzero_pd (),\
+                (__mmask8)( __U),\
+                __I,\
+                _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_maskz_roundscale_round_sd( __U, __A, __B, __I, __R) __extension__ ({ \
+__builtin_ia32_rndscalesd_round_mask ( (__v2df)( __A),\
+                 (__v2df)( __B),\
+                (__v2df) _mm_setzero_pd (),\
+                (__mmask8)( __U),\
+                __I,\
+                __R);\
+})
+
+#define _mm_roundscale_round_ss( __A, __B, __imm, __R) __extension__ ({ \
+__builtin_ia32_rndscaless_round_mask ((__v4sf)( __A),\
+               (__v4sf)( __B),  (__v4sf) _mm_setzero_ps(),\
+                (__mmask8) -1, __imm, __R);\
+})
+
+#define _mm_roundscale_ss( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_rndscaless_round_mask ((__v4sf)( __A),\
+               (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),\
+                (__mmask8) -1, ( __imm),\
+               _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_mask_roundscale_ss( __W, __U, __A, __B, __I) __extension__ ({ \
+__builtin_ia32_rndscaless_round_mask ( (__v4sf) ( __A),\
+                 (__v4sf)( __B),\
+                 (__v4sf)( __W),\
+                 (__mmask8)( __U),\
+                 __I,\
+                 _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_mask_roundscale_round_ss( __W, __U, __A, __B, __I, __R) __extension__ ({ \
+__builtin_ia32_rndscaless_round_mask ( (__v4sf)( __A),\
+                (__v4sf)( __B),\
+                (__v4sf)( __W),\
+                (__mmask8)( __U),\
+                __I,\
+                __R);\
+})
+
+#define _mm_maskz_roundscale_ss( __U, __A, __B, __I) __extension__ ({ \
+__builtin_ia32_rndscaless_round_mask ( (__v4sf)( __A),\
+                 (__v4sf)( __B),\
+                (__v4sf) _mm_setzero_ps (),\
+                (__mmask8)( __U),\
+                __I,\
+                _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_maskz_roundscale_round_ss( __U, __A, __B, __I, __R) __extension__ ({ \
+__builtin_ia32_rndscaless_round_mask ( (__v4sf)( __A),\
+                 (__v4sf)( __B),\
+                (__v4sf) _mm_setzero_ps (),\
+                (__mmask8)( __U),\
+                __I,\
+                __R);\
+})
+
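+/* vscalef: each result element is __A * 2^floor(__B), element-wise. */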
+#define _mm512_scalef_round_pd( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefpd512_mask ((__v8df)( __A),\
+                (__v8df)( __B),\
+                (__v8df)\
+                _mm512_undefined_pd (),\
+                (__mmask8) -1,( __R));\
+})
+
+#define _mm512_mask_scalef_round_pd( __W, __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefpd512_mask ((__v8df)( __A),\
+                (__v8df)( __B),\
+                (__v8df)( __W),\
+                (__mmask8)( __U),( __R));\
+})
+
+#define _mm512_maskz_scalef_round_pd( __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefpd512_mask ((__v8df)( __A),\
+                (__v8df)( __B),\
+                (__v8df)\
+                _mm512_setzero_pd (),\
+                (__mmask8)( __U),( __R));\
+})
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_scalef_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_scalef_round_ps( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefps512_mask ((__v16sf)( __A),\
+               (__v16sf)( __B),\
+               (__v16sf)\
+               _mm512_undefined_ps (),\
+               (__mmask16) -1,( __R));\
+})
+
+#define _mm512_mask_scalef_round_ps( __W, __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefps512_mask ((__v16sf)( __A),\
+               (__v16sf)( __B),\
+               (__v16sf)( __W),\
+               (__mmask16)( __U),( __R));\
+})
+
+#define _mm512_maskz_scalef_round_ps( __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefps512_mask ((__v16sf)( __A),\
+               (__v16sf)( __B),\
+               (__v16sf)\
+               _mm512_setzero_ps (),\
+               (__mmask16)( __U),( __R));\
+})
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_scalef_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_undefined_ps (),
+               (__mmask16) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf) __W,
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_setzero_ps (),
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_scalef_round_sd( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefsd_round_mask ((__v2df)( __A),\
+              (__v2df)( __B), (__v2df) _mm_setzero_pd(),\
+              (__mmask8) -1,\
+              ( __R));\
+})
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_scalef_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
+              (__v2df) __B, (__v2df) _mm_setzero_pd(),
+              (__mmask8) -1,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_sd( __W, __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefsd_round_mask ((__v2df)( __A),\
+              (__v2df)( __B), (__v2df) __W,\
+              (__mmask8) __U,\
+              ( __R));\
+})
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_sd( __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefsd_round_mask ((__v2df)( __A),\
+              (__v2df)( __B), (__v2df) _mm_setzero_pd (),\
+              (__mmask8) __U,\
+              ( __R));\
+})
+
+#define _mm_scalef_round_ss( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefss_round_mask ((__v4sf)( __A),\
+             (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),\
+             (__mmask8) -1,\
+             ( __R));\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_scalef_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
+             (__v4sf) __B, (__v4sf) _mm_setzero_ps(),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_ss( __W, __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefss_round_mask ((__v4sf)( __A),\
+             (__v4sf)( __B), (__v4sf) __W,\
+             (__mmask8) __U,\
+             ( __R));\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+                 (__v4sf) __B,
+                (__v4sf) _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_ss( __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_scalefss_round_mask ((__v4sf)( __A),\
+             (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),\
+             (__mmask8) __U,\
+             ( __R));\
+})
+
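+/* Arithmetic right shifts by the immediate count __B; the sign bit fills
+   the vacated positions. */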
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi32 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+#define _mm512_mask_srai_epi32( __W, __U, __A, __B) __extension__ ({ \
+__builtin_ia32_psradi512_mask ((__v16si)( __A),( __B),\
+              (__v16si)( __W),\
+              (__mmask16)( __U));\
+})
+
+#define _mm512_maskz_srai_epi32( __U, __A, __B) __extension__ ({ \
+__builtin_ia32_psradi512_mask ((__v16si)( __A),( __B),\
+              (__v16si)\
+              _mm512_setzero_si512 (),\
+              (__mmask16)( __U));\
+})
+
+#define _mm512_srai_epi64( __A, __B) __extension__ ({ \
+__builtin_ia32_psraqi512_mask ((__v8di)( __A),( __B),\
+              (__v8di)\
+              _mm512_setzero_si512 (),\
+              (__mmask8) -1);\
+})
+
+#define _mm512_mask_srai_epi64( __W, __U, __A, __B) __extension__ ({ \
+__builtin_ia32_psraqi512_mask ((__v8di)( __A),( __B),\
+              (__v8di)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_srai_epi64( __U, __A, __B) __extension__ ({ \
+__builtin_ia32_psraqi512_mask ((__v8di)( __A),( __B),\
+              (__v8di)\
+              _mm512_setzero_si512 (),\
+              (__mmask8)( __U));\
+})
+
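+/* 128-bit-lane shuffles (vshuff32x4/f64x2, vshufi32x4/i64x2): __imm picks
+   two 128-bit lanes from __A for the low half of the result and two from
+   __B for the high half. */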
+#define _mm512_shuffle_f32x4( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f32x4_mask ((__v16sf)( __A),\
+              (__v16sf)( __B),( __imm),\
+              (__v16sf)\
+              _mm512_undefined_ps (),\
+              (__mmask16) -1);\
+})
+
+#define _mm512_mask_shuffle_f32x4( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f32x4_mask ((__v16sf)( __A),\
+              (__v16sf)( __B),( __imm),\
+              (__v16sf)( __W),\
+              (__mmask16)( __U));\
+})
+
+#define _mm512_maskz_shuffle_f32x4( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f32x4_mask ((__v16sf)( __A),\
+              (__v16sf)( __B),( __imm),\
+              (__v16sf)\
+              _mm512_setzero_ps (),\
+              (__mmask16)( __U));\
+})
+
+#define _mm512_shuffle_f64x2( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f64x2_mask ((__v8df)( __A),\
+               (__v8df)( __B),( __imm),\
+               (__v8df)\
+               _mm512_undefined_pd (),\
+               (__mmask8) -1);\
+})
+
+#define _mm512_mask_shuffle_f64x2( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f64x2_mask ((__v8df)( __A),\
+               (__v8df)( __B),( __imm),\
+               (__v8df)( __W),\
+               (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_shuffle_f64x2( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f64x2_mask ((__v8df)( __A),\
+               (__v8df)( __B),( __imm),\
+               (__v8df)\
+               _mm512_setzero_pd (),\
+               (__mmask8)( __U));\
+})
+
+#define _mm512_shuffle_i32x4( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i32x4_mask ((__v16si)( __A),\
+               (__v16si)( __B),\
+              ( __imm),\
+               (__v16si)\
+               _mm512_setzero_si512 (),\
+               (__mmask16) -1);\
+})
+
+#define _mm512_mask_shuffle_i32x4( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i32x4_mask ((__v16si)( __A),\
+               (__v16si)( __B),\
+              ( __imm),\
+               (__v16si)( __W),\
+               (__mmask16)( __U));\
+})
+
+#define _mm512_maskz_shuffle_i32x4( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i32x4_mask ((__v16si)( __A),\
+               (__v16si)( __B),\
+              ( __imm),\
+               (__v16si)\
+               _mm512_setzero_si512 (),\
+               (__mmask16)( __U));\
+})
+
+#define _mm512_shuffle_i64x2( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i64x2_mask ((__v8di)( __A),\
+               (__v8di)( __B),( __imm),\
+               (__v8di)\
+               _mm512_setzero_si512 (),\
+               (__mmask8) -1);\
+})
+
+#define _mm512_mask_shuffle_i64x2( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i64x2_mask ((__v8di)( __A),\
+               (__v8di)( __B),( __imm),\
+               (__v8di)( __W),\
+               (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_shuffle_i64x2( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i64x2_mask ((__v8di)( __A),\
+               (__v8di)( __B),( __imm),\
+               (__v8di)\
+               _mm512_setzero_si512 (),\
+               (__mmask8)( __U));\
+})
+
+#define _mm512_shuffle_pd( __M, __V, __imm) __extension__ ({ \
+__builtin_ia32_shufpd512_mask ((__v8df)( __M),\
+              (__v8df)( __V),( __imm),\
+              (__v8df)\
+              _mm512_undefined_pd (),\
+              (__mmask8) -1);\
+})
+
+#define _mm512_mask_shuffle_pd( __W, __U, __M, __V, __imm) __extension__ ({ \
+__builtin_ia32_shufpd512_mask ((__v8df)( __M),\
+              (__v8df)( __V),( __imm),\
+              (__v8df)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_shuffle_pd( __U, __M, __V, __imm) __extension__ ({ \
+__builtin_ia32_shufpd512_mask ((__v8df)( __M),\
+              (__v8df)( __V),( __imm),\
+              (__v8df)\
+              _mm512_setzero_pd (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm512_shuffle_ps( __M, __V, __imm) __extension__ ({ \
+__builtin_ia32_shufps512_mask ((__v16sf)( __M),\
+             (__v16sf)( __V),( __imm),\
+             (__v16sf)\
+             _mm512_undefined_ps (),\
+             (__mmask16) -1);\
+})
+
+#define _mm512_mask_shuffle_ps( __W, __U, __M, __V, __imm) __extension__ ({ \
+__builtin_ia32_shufps512_mask ((__v16sf)( __M),\
+             (__v16sf)( __V),( __imm),\
+             (__v16sf)( __W),\
+             (__mmask16)( __U));\
+})
+
+#define _mm512_maskz_shuffle_ps( __U, __M, __V, __imm) __extension__ ({ \
+__builtin_ia32_shufps512_mask ((__v16sf)( __M),\
+             (__v16sf)( __V),( __imm),\
+             (__v16sf)\
+             _mm512_setzero_ps (),\
+             (__mmask16)( __U));\
+})
+
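+/* Scalar square roots.  Note the builtin takes the sqrt operand (__B)
+   first and the vector supplying the upper result element (__A) second. */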
+#define _mm_sqrt_round_sd( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_sqrtsd_round_mask ((__v2df)( __B),\
+            (__v2df)( __A),(__v2df) _mm_setzero_pd(),\
+            (__mmask8) -1,\
+           ( __R));\
+})
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __B,
+                 (__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_sqrt_round_sd( __W, __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_sqrtsd_round_mask ((__v2df)( __B),\
+            (__v2df)( __A),(__v2df) __W,\
+            (__mmask8) __U,\
+            ( __R));\
+})
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) 
+{
+ return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __B,
+                 (__v2df) __A,
+                (__v2df) _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_sqrt_round_sd( __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_sqrtsd_round_mask ((__v2df)( __B),\
+            (__v2df)( __A),(__v2df) _mm_setzero_pd(),\
+            (__mmask8) __U,\
+            ( __R));\
+})
+
+#define _mm_sqrt_round_ss( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_sqrtss_round_mask ((__v4sf)( __B),\
+                 (__v4sf)( __A),(__v4sf) _mm_setzero_ps(),\
+                 (__mmask8) -1,\
+                 ( __R));\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __B,
+                 (__v4sf) __A,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_sqrt_round_ss( __W, __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_sqrtss_round_mask ((__v4sf)( __B),\
+                 (__v4sf)( __A),(__v4sf) __W,\
+                 (__mmask8) __U,\
+                 ( __R));\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __B,
+                 (__v4sf) __A,
+                (__v4sf) _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_sqrt_round_ss( __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_sqrtss_round_mask ((__v4sf)( __B),\
+                 (__v4sf)( __A),(__v4sf) _mm_setzero_ps(),\
+                 (__mmask8) __U,\
+                 __R);\
+})
+
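+/* Subvector broadcasts: a 128-bit (f32x4/i32x4) or 256-bit (f64x4/i64x4)
+   source is replicated to fill the 512-bit destination. */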
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x4 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                 (__v16sf)
+                 _mm512_undefined_ps (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                 (__v16sf) __O,
+                 __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcast_f64x4 (__m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                  (__v8df)
+                  _mm512_undefined_pd (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                  (__v8df) __O,
+                  __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x4 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                  (__v16si)
+                  _mm512_undefined_epi32 (),
+                  (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                  (__v16si) __O,
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i64x4 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                  (__v8di)
+                  _mm512_undefined_epi32 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                  (__v8di) __O,
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+              (__v8df) __O, __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+              (__v8df)
+              _mm512_setzero_pd (),
+              __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+             (__v16sf) __O, __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+             (__v16sf)
+             _mm512_setzero_ps (),
+             __M);
+}
+
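+/* Signed-saturating down-conversions (vpmovs*): elements are narrowed with
+   signed saturation.  The storeu forms write the narrowed elements to
+   memory, leaving masked-off destinations untouched. */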
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+               (__v16qi) _mm_undefined_si128 (),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+               (__v16hi) _mm256_undefined_si256 (),
+               (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+               (__v16hi) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+               (__v16hi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+               (__v16qi) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+               (__v8si) _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+               (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+               (__v8si) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+               (__v8hi) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
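+/* Unsigned-saturating down-conversions (vpmovus*): elements are clamped to
+   the unsigned range of the narrower type. */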
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                (__v16qi) _mm_undefined_si128 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                (__v16hi) _mm256_undefined_si256 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                (__v16hi) __O,
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                (__v16hi) _mm256_setzero_si256 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                (__v16qi) _mm_undefined_si128 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                (__v8si) _mm256_undefined_si256 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                (__v8si) _mm256_setzero_si256 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                (__v8hi) _mm_undefined_si128 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
+}
+
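+/* Truncating down-conversions (vpmov*): the low bits of each element are
+   kept, with no saturation. */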
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+              (__v16qi) _mm_undefined_si128 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+              (__v16hi) _mm256_undefined_si256 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+              (__v16hi) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+              (__v16hi) _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+              (__v16qi) _mm_undefined_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+              (__v8si) _mm256_undefined_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+              (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+              (__v8si) _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+              (__v8hi) _mm_undefined_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
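+/* Extract/insert of 128-bit (x4 of 32-bit) and 256-bit (x4 of 64-bit)
+   subvectors at the position selected by __imm. */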
+#define _mm512_extracti32x4_epi32( __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti32x4_mask ((__v16si)( __A),\
+                 (__imm),\
+                 (__v4si) _mm_undefined_si128 (),\
+                 (__mmask8) -1);\
+})
+
+#define _mm512_mask_extracti32x4_epi32( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti32x4_mask ((__v16si)( __A),\
+                 ( __imm),\
+                 (__v4si)( __W),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_extracti32x4_epi32( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti32x4_mask ((__v16si)( __A),\
+                 ( __imm),\
+                 (__v4si) _mm_setzero_si128 (),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm512_extracti64x4_epi64( __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti64x4_mask ((__v8di)( __A),\
+                 ( __imm),\
+                 (__v4di) _mm256_undefined_si256 (),\
+                 (__mmask8) -1);\
+})
+
+#define _mm512_mask_extracti64x4_epi64( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti64x4_mask ((__v8di)( __A),\
+                 ( __imm),\
+                 (__v4di)( __W),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_extracti64x4_epi64( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti64x4_mask ((__v8di)( __A),\
+                 ( __imm),\
+                 (__v4di) _mm256_setzero_si256 (),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm512_insertf64x4( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf64x4_mask ((__v8df)( __A),\
+                (__v4df)( __B),\
+                ( __imm),\
+                (__v8df) _mm512_undefined_pd (),\
+                (__mmask8) -1);\
+})
+
+#define _mm512_mask_insertf64x4( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf64x4_mask ((__v8df)( __A),\
+                (__v4df)( __B),\
+                ( __imm),\
+                (__v8df)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_insertf64x4( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf64x4_mask ((__v8df)( __A),\
+                (__v4df)( __B),\
+                ( __imm),\
+                (__v8df) _mm512_setzero_pd (),\
+                (__mmask8)( __U));\
+})
+
+#define _mm512_inserti64x4( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti64x4_mask ((__v8di)( __A),\
+                (__v4di)( __B),\
+                ( __imm),\
+                (__v8di) _mm512_setzero_si512 (),\
+                (__mmask8) -1);\
+})
+
+#define _mm512_mask_inserti64x4( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti64x4_mask ((__v8di)( __A),\
+                (__v4di)( __B),\
+                ( __imm),\
+                (__v8di)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_inserti64x4( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti64x4_mask ((__v8di)( __A),\
+                (__v4di)( __B),\
+                ( __imm),\
+                (__v8di) _mm512_setzero_si512 (),\
+                (__mmask8)( __U));\
+})
+
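+/* vgetmant: extracts the normalized mantissa of each element.  __B selects
+   the normalization interval and __C the sign control; they are combined
+   into the instruction immediate as (__C << 2) | __B. */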
+#define _mm512_getmant_round_pd( __A, __B, __C, __R) __extension__ ({ \
+__builtin_ia32_getmantpd512_mask ((__v8df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v8df) _mm512_undefined_pd (),\
+                 (__mmask8) -1,( __R));\
+})
+
+#define _mm512_mask_getmant_round_pd( __W, __U, __A, __B, __C, __R) __extension__ ({ \
+__builtin_ia32_getmantpd512_mask ((__v8df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v8df)( __W),(__mmask8)( __U),\
+                 ( __R));\
+})
+
+#define _mm512_maskz_getmant_round_pd( __U, __A, __B, __C, __R) __extension__ ({ \
+__builtin_ia32_getmantpd512_mask ((__v8df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v8df) _mm512_setzero_pd (),\
+                 (__mmask8)( __U),( __R));\
+})
+
+#define _mm512_getmant_pd( __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantpd512_mask ((__v8df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v8df) _mm512_setzero_pd (),\
+                 (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm512_mask_getmant_pd( __W, __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantpd512_mask ((__v8df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v8df)( __W), (__mmask8)( __U), _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm512_maskz_getmant_pd( __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantpd512_mask ((__v8df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v8df) _mm512_setzero_pd (),\
+                 (__mmask8)( __U), _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm512_getmant_round_ps( __A, __B, __C, __R) __extension__ ({ \
+__builtin_ia32_getmantps512_mask ((__v16sf)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v16sf) _mm512_undefined_ps (),\
+                 (__mmask16) -1,( __R));\
+})
+
+#define _mm512_mask_getmant_round_ps( __W, __U, __A, __B, __C, __R) __extension__ ({ \
+__builtin_ia32_getmantps512_mask ((__v16sf)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v16sf)( __W),(__mmask16)( __U),\
+                 ( __R));\
+})
+
+#define _mm512_maskz_getmant_round_ps( __U, __A, __B, __C, __R) __extension__ ({ \
+__builtin_ia32_getmantps512_mask ((__v16sf)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v16sf) _mm512_setzero_ps (),\
+                 (__mmask16)( __U),( __R));\
+})
+
+#define _mm512_getmant_ps( __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantps512_mask ((__v16sf)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v16sf) _mm512_undefined_ps (),\
+                 (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm512_mask_getmant_ps( __W, __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantps512_mask ((__v16sf)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v16sf)( __W),(__mmask16) ( __U),\
+                 _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm512_maskz_getmant_ps( __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantps512_mask ((__v16sf)( __A),\
+                (__C << 2) |( __B),\
+                (__v16sf) _mm512_setzero_ps (),\
+                (__mmask16)( __U),_MM_FROUND_CUR_DIRECTION);\
+})
+
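+/* vgetexp: each result element is the unbiased exponent of the input,
+   i.e. floor(log2(|x|)), returned as a floating-point value. */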
+#define _mm512_getexp_round_pd( __A, __R) __extension__ ({ \
+__builtin_ia32_getexppd512_mask ((__v8df)( __A),\
+                (__v8df) _mm512_undefined_pd (),\
+                (__mmask8) -1,( __R));\
+})
+
+#define _mm512_mask_getexp_round_pd( __W, __U, __A, __R) __extension__ ({ \
+__builtin_ia32_getexppd512_mask ((__v8df)( __A),\
+                (__v8df)( __W),\
+                (__mmask8)( __U),( __R));\
+})
+
+#define _mm512_maskz_getexp_round_pd( __U, __A, __R) __extension__ ({ \
+__builtin_ia32_getexppd512_mask ((__v8df)( __A),\
+                (__v8df) _mm512_setzero_pd (),\
+                (__mmask8)( __U),( __R));\
+})
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_getexp_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_getexp_round_ps( __A, __R) __extension__ ({ \
+__builtin_ia32_getexpps512_mask ((__v16sf)( __A),\
+               (__v16sf) _mm512_undefined_ps (),\
+               (__mmask16) -1,( __R));\
+})
+
+#define _mm512_mask_getexp_round_ps( __W, __U, __A, __R) __extension__ ({ \
+__builtin_ia32_getexpps512_mask ((__v16sf)( __A),\
+               (__v16sf)( __W),\
+               (__mmask16)( __U),( __R));\
+})
+
+#define _mm512_maskz_getexp_round_ps( __U, __A, __R) __extension__ ({ \
+__builtin_ia32_getexpps512_mask ((__v16sf)( __A),\
+               (__v16sf) _mm512_setzero_ps (),\
+               (__mmask16)( __U),( __R));\
+})
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_getexp_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_undefined_ps (),
+               (__mmask16) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_setzero_ps (),
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
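+/* Gather/scatter wrappers.  A hedged usage sketch (the buffer and index
+ * names are hypothetical, not part of this header): gather sixteen floats
+ * from base + 4*idx[i], then scatter them back to the same slots.
+ *
+ *   float base[1024];
+ *   __m512i idx = _mm512_load_si512(indices);      // 16 x 32-bit indices
+ *   __m512  v   = _mm512_i32gather_ps(idx, base, 4);
+ *   _mm512_i32scatter_ps(base, idx, v, 4);
+ */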
+#define _mm512_i64gather_ps( __index, __addr, __scale) __extension__ ({ \
+__builtin_ia32_gatherdiv16sf ((__v8sf) _mm256_undefined_ps (),\
+                              __addr, (__v8di) __index, (__mmask8) -1, __scale);\
+})
+
+#define _mm512_mask_i64gather_ps( __v1_old, __mask, __index,\
+                                  __addr, __scale) __extension__({\
+__builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
+                              __addr,(__v8di) __index, __mask, __scale);\
+})
+
+#define _mm512_i64gather_epi32(__index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gatherdiv16si ((__v8si) _mm256_undefined_si256 (),\
+                              __addr, (__v8di) __index, (__mmask8) -1 , __scale);\
+})
+
+#define _mm512_mask_i64gather_epi32( __v1_old,  __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gatherdiv16si ((__v8si) __v1_old,\
+                              __addr, (__v8di) __index, __mask , __scale);\
+})
+
+#define _mm512_i64gather_pd(__index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gatherdiv8df ((__v8df) _mm512_undefined_pd(),\
+                              __addr, (__v8di) __index, (__mmask8) -1 , __scale);\
+})
+
+#define _mm512_mask_i64gather_pd( __v1_old,  __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gatherdiv8df ((__v8df) __v1_old,\
+                              __addr, (__v8di) __index, __mask , __scale);\
+})
+
+#define _mm512_i64gather_epi64(__index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gatherdiv8di ((__v8di) _mm512_undefined_epi32 (),\
+                              __addr, (__v8di) __index, (__mmask8) -1 , __scale);\
+})
+
+#define _mm512_mask_i64gather_epi64( __v1_old,  __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gatherdiv8di ((__v8di) __v1_old,\
+                              __addr, (__v8di) __index, __mask , __scale);\
+})
+
+#define _mm512_i32gather_ps(__index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gathersiv16sf ((__v16sf) _mm512_undefined_ps(),\
+                              __addr, (__v16si) __index, (__mmask16) -1, __scale);\
+})
+
+#define _mm512_mask_i32gather_ps( __v1_old,  __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gathersiv16sf ((__v16sf) __v1_old,\
+                              __addr, (__v16si) __index, __mask , __scale);\
+})
+
+#define _mm512_i32gather_epi32(__index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gathersiv16si ((__v16si) _mm512_undefined_epi32 (),\
+                              __addr, (__v16si) __index, (__mmask16) -1, __scale);\
+})
+
+#define _mm512_mask_i32gather_epi32( __v1_old,  __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gathersiv16si ((__v16si) __v1_old,\
+                              __addr, (__v16si) __index, __mask , __scale);\
+})
+
+#define _mm512_i32gather_pd(__index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gathersiv8df ((__v8df) _mm512_undefined_pd(),\
+                              __addr, (__v8si) __index, (__mmask8) -1 , __scale);\
+})
+
+#define _mm512_mask_i32gather_pd( __v1_old,  __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gathersiv8df ((__v8df) __v1_old,\
+                              __addr, (__v8si) __index, __mask , __scale);\
+})
+
+#define _mm512_i32gather_epi64(__index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gathersiv8di ((__v8di) _mm512_undefined_epi32(),\
+                              __addr, (__v8si) __index, (__mmask8) -1 , __scale);\
+})
+
+#define _mm512_mask_i32gather_epi64( __v1_old,  __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gathersiv8di ((__v8di) __v1_old,\
+                              __addr, (__v8si) __index, __mask , __scale);\
+})
+
+#define _mm512_i64scatter_ps(__addr,__index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scatterdiv16sf(__addr, (__mmask8) -1,\
+                             (__v8di) __index, (__v8sf) __v1, __scale);\
+})
+
+#define _mm512_mask_i64scatter_ps(__addr, __mask,__index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scatterdiv16sf(__addr, __mask,\
+                             (__v8di) __index, (__v8sf) __v1, __scale);\
+})
+
+#define _mm512_i64scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scatterdiv16si (__addr, (__mmask8) -1,\
+                              (__v8di) __index, (__v8si) __v1, __scale);\
+})
+
+#define _mm512_mask_i64scatter_epi32(__addr, __mask, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scatterdiv16si (__addr, __mask, (__v8di) __index,\
+                              (__v8si) __v1, __scale);\
+})
+
+#define _mm512_i64scatter_pd( __addr, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scatterdiv8df (__addr, (__mmask8) -1,\
+                             (__v8di) __index, (__v8df) __v1, __scale);\
+})
+
+#define _mm512_mask_i64scatter_pd( __addr, __mask, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scatterdiv8df (__addr, __mask, (__v8di) __index,\
+                             (__v8df) __v1, __scale);\
+})
+
+#define _mm512_i64scatter_epi64( __addr, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scatterdiv8di (__addr, (__mmask8) -1,\
+                             (__v8di) __index, (__v8di) __v1, __scale);\
+})
+
+#define _mm512_mask_i64scatter_epi64( __addr, __mask, __index,  __v1,  __scale) __extension__ ({\
+__builtin_ia32_scatterdiv8di(__addr, __mask, (__v8di) __index,\
+                            (__v8di) __v1, __scale);\
+})
+
+#define _mm512_i32scatter_ps( __addr, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scattersiv16sf (__addr, (__mmask16) -1,\
+                              (__v16si) __index, (__v16sf) __v1, __scale);\
+})
+
+#define _mm512_mask_i32scatter_ps( __addr, __mask, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scattersiv16sf (__addr, __mask, (__v16si) __index,\
+                              (__v16sf) __v1, __scale);\
+})
+
+#define _mm512_i32scatter_epi32( __addr, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scattersiv16si (__addr, (__mmask16) -1,\
+                              (__v16si) __index, (__v16si) __v1, __scale);\
+})
+
+#define _mm512_mask_i32scatter_epi32( __addr, __mask, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scattersiv16si (__addr, __mask, (__v16si) __index,\
+                              (__v16si) __v1, __scale);\
+})
+
+#define _mm512_i32scatter_pd( __addr, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scattersiv8df (__addr, (__mmask8) -1,\
+                             (__v8si) __index, (__v8df) __v1, __scale);\
+})
+
+#define _mm512_mask_i32scatter_pd( __addr, __mask, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scattersiv8df (__addr, __mask, (__v8si) __index,\
+                             (__v8df) __v1, __scale);\
+})
+
+#define _mm512_i32scatter_epi64( __addr, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scattersiv8di (__addr, (__mmask8) -1,\
+                             (__v8si) __index, (__v8di) __v1, __scale);\
+})
+
+#define _mm512_mask_i32scatter_epi64( __addr, __mask, __index, __v1, __scale) __extension__ ({\
+__builtin_ia32_scattersiv8di (__addr, __mask, (__v8si) __index,\
+                             (__v8di) __v1, __scale);\
+})
+
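+/* Masked scalar FMA.  Only bit 0 of the mask takes part: when it is clear,
+ * element 0 falls back to the passthrough source (or to zero in the maskz
+ * forms), and the upper elements are carried through.  Hedged sketch with
+ * hypothetical operands:
+ *
+ *   __m128 r = _mm_maskz_fmadd_ss(k, a, b, c);  // r[0] = (k & 1) ? fma : 0
+ */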
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmadd_round_ss( __W, __U, __A, __B, __R) __extension__({\
+__builtin_ia32_vfmaddss3_mask ((__v4sf) __A,\
+          (__v4sf) __B,\
+          (__v4sf) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_ss( __U, __A, __B, __C, __R) __extension__ ({\
+__builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,\
+          (__v4sf) __B,\
+          (__v4sf) __C,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_ss( __W, __X, __Y, __U, __R) __extension__ ({\
+__builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,\
+          (__v4sf) __X,\
+          (__v4sf) __Y,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+          (__v4sf) -(__B),
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmsub_round_ss( __W, __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_vfmaddss3_mask ((__v4sf) __A,\
+          (__v4sf) -(__B),\
+          (__v4sf) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) -(__C),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_ss( __U, __A, __B, __C, __R) __extension__ ({\
+__builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,\
+          (__v4sf) __B,\
+          (__v4sf) -(__C),\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) -(__Y),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_ss( __W, __X, __Y, __U, __R) __extension__ ({\
+__builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,\
+          (__v4sf) __X,\
+          (__v4sf) -(__Y),\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) -(__A),
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmadd_round_ss( __W, __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_vfmaddss3_mask ((__v4sf) -(__A),\
+          (__v4sf) __B,\
+          (__v4sf) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) -(__A),
+          (__v4sf) __B,
+          (__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_ss( __U, __A, __B, __C, __R) __extension__ ({\
+__builtin_ia32_vfmaddss3_maskz ((__v4sf) -(__A),\
+          (__v4sf) __B,\
+          (__v4sf) __C,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) -(__W),
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_ss( __W,  __X, __Y, __U, __R) __extension__({\
+__builtin_ia32_vfmaddss3_mask3 ((__v4sf) -(__W),\
+          (__v4sf) __X,\
+          (__v4sf) __Y,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) -(__A),
+          (__v4sf) -(__B),
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmsub_round_ss( __W, __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_vfmaddss3_mask ((__v4sf) -(__A),\
+          (__v4sf) -(__B),\
+          (__v4sf) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) -(__A),
+          (__v4sf) __B,
+          (__v4sf) -(__C),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_ss( __U, __A, __B, __C, __R) __extension__ ({\
+__builtin_ia32_vfmaddss3_maskz((__v4sf) -(__A),\
+          (__v4sf) __B,\
+          (__v4sf) -(__C),\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) -(__W),
+          (__v4sf) __X,
+          (__v4sf) -(__Y),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_ss( __W, __X, __Y, __U, __R) __extension__({\
+__builtin_ia32_vfmaddss3_mask3 ((__v4sf) -(__W),\
+          (__v4sf) __X,\
+          (__v4sf) -(__Y),\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_sd (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmadd_round_sd( __W, __U, __A, __B, __R) __extension__({\
+__builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,\
+          (__v2df) __B,\
+          (__v2df) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_sd (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_sd( __U, __A, __B, __C, __R) __extension__ ({\
+__builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,\
+          (__v2df) __B,\
+          (__v2df) __C,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_sd (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_sd( __W, __X, __Y, __U, __R) __extension__ ({\
+__builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,\
+          (__v2df) __X,\
+          (__v2df) __Y,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_sd (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+          (__v2df) -(__B),
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmsub_round_sd( __W, __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,\
+          (__v2df) -(__B),\
+          (__v2df) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_sd (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) -(__C),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_sd( __U, __A, __B, __C, __R) __extension__ ({\
+__builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,\
+          (__v2df) __B,\
+          (__v2df) -(__C),\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_sd (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+          (__v2df) __X,
+          (__v2df) -(__Y),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_sd( __W, __X, __Y, __U, __R) __extension__ ({\
+__builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,\
+          (__v2df) __X,\
+          (__v2df) -(__Y),\
+          (__mmask8) __U, __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_sd (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_mask ( (__v2df) -(__A),
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmadd_round_sd( __W, __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_vfmaddsd3_mask ( (__v2df) -(__A),\
+          (__v2df) __B,\
+          (__v2df) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_sd (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) -(__A),
+          (__v2df) __B,
+          (__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_sd( __U, __A, __B, __C, __R) __extension__ ({\
+__builtin_ia32_vfmaddsd3_maskz ( (__v2df) -(__A),\
+          (__v2df) __B,\
+          (__v2df) __C,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_sd (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) -(__W),
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_sd( __W,  __X, __Y, __U, __R) __extension__({\
+__builtin_ia32_vfmaddsd3_mask3 ((__v2df) -(__W),\
+          (__v2df) __X,\
+          (__v2df) __Y,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_sd (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_mask ( (__v2df) -(__A),
+          (__v2df) -(__B),
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmsub_round_sd( __W, __U, __A, __B, __R) __extension__ ({\
+__builtin_ia32_vfmaddsd3_mask ( (__v2df) -(__A),\
+          (__v2df) -(__B),\
+          (__v2df) __W,\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_sd (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) -(__A),
+          (__v2df) __B,
+          (__v2df) -(__C),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_sd( __U, __A, __B, __C, __R) __extension__ ({\
+__builtin_ia32_vfmaddsd3_maskz( (__v2df) -(__A),\
+          (__v2df) __B,\
+          (__v2df) -(__C),\
+          (__mmask8) __U,\
+          __R);\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_sd (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 
+{
+ return (__m128) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) -(__W),
+          (__v2df) __X,
+          (__v2df) -(__Y),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_sd( __W, __X, __Y, __U, __R) __extension__({\
+__builtin_ia32_vfmaddsd3_mask3 ((__v2df) -(__W),\
+          (__v2df) __X,\
+          (__v2df) -(__Y),\
+          (__mmask8) __U,\
+          __R);\
+})
+
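+/* _mm512_permutex_* shuffle 64-bit elements with an immediate that is
+ * applied within each 256-bit half, while the _mm512_permutexvar_*
+ * functions further below take their indices from a vector.  Sketch:
+ *
+ *   __m512d r = _mm512_permutex_pd(v, 0x1B);  // reverse each 256-bit half
+ */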
+#define _mm512_permutex_pd( __X, __M) __extension__ ({ \
+__builtin_ia32_permdf512_mask ((__v8df)( __X),( __M),\
+              (__v8df) _mm512_undefined_pd (),\
+              (__mmask8) -1);\
+})
+
+#define _mm512_mask_permutex_pd( __W, __U, __X, __M) __extension__ ({ \
+__builtin_ia32_permdf512_mask ((__v8df)( __X),( __M),\
+              (__v8df)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm512_maskz_permutex_pd( __U, __X, __M) __extension__ ({ \
+__builtin_ia32_permdf512_mask ((__v8df)( __X),( __M),\
+              (__v8df) _mm512_setzero_pd (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm512_permutex_epi64( __X, __I) __extension__ ({ \
+__builtin_ia32_permdi512_mask ((__v8di)( __X),( __I),\
+              (__v8di) _mm512_undefined_epi32 (),\
+              (__mmask8) (-1));\
+})
+
+#define _mm512_mask_permutex_epi64( __W, __M, __X, __I) __extension__ ({ \
+__builtin_ia32_permdi512_mask ((__v8di)( __X),( __I),\
+              (__v8di)( __W),\
+              (__mmask8)( __M));\
+})
+
+#define _mm512_maskz_permutex_epi64( __M, __X, __I) __extension__ ({ \
+__builtin_ia32_permdi512_mask ((__v8di)( __X),( __I),\
+              (__v8di) _mm512_setzero_si512 (),\
+              (__mmask8)( __M));\
+})
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) _mm512_undefined_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) _mm512_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) _mm512_undefined_epi32 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
+             __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) __W,
+                 __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) _mm512_undefined_ps (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) _mm512_setzero_ps (),
+                (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) _mm512_undefined_epi32 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
+             __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) __W,
+                 __M);
+}
+
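+/* 16-bit mask-register logic, mapping to the k-register instructions
+ * (KANDW, KORW, KXORW, ...).  Illustrative sketch, names hypothetical:
+ *
+ *   __mmask16 both = _mm512_kand(k1, k2);       // lanes set in both masks
+ *   int all_zero   = _mm512_kortestz(k1, k2);   // 1 iff (k1 | k2) == 0
+ */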
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kand (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kandn (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestc (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kortestchi ((__mmask16) __A,
+            (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestz (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kortestzhi ((__mmask16) __A,
+            (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxnor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
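+/* Non-temporal (streaming) loads and stores.  These minimize cache
+ * pollution and, like the other aligned 512-bit accesses, expect 64-byte
+ * aligned pointers.  Hedged sketch (the destination name is hypothetical):
+ *
+ *   _mm512_stream_ps(dst, v);      // dst must be 64-byte aligned
+ */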
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_si512 (__m512i * __P, __m512i __A)
+{
+  __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_stream_load_si512 (void *__P)
+{
+  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_pd (double *__P, __m512d __A)
+{
+  __builtin_ia32_movntpd512 (__P, (__v8df) __A);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_ps (float *__P, __m512 __A)
+{
+  __builtin_ia32_movntps512 (__P, (__v16sf) __A);
+}
+
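+/* Compress packs the elements selected by the mask contiguously into the
+ * low end of the destination; expand (further below) is the inverse.
+ * Hedged sketch, names hypothetical:
+ *
+ *   __m512d packed = _mm512_maskz_compress_pd(k, v);  // kept lanes first
+ */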
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                  (__v8df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) __U);
+}
+
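+/* Scalar compare-to-mask.  __P is one of the _CMP_* predicates from
+ * <immintrin.h>, and only bit 0 of the returned mask is meaningful.
+ * Hedged sketch with hypothetical inputs:
+ *
+ *   __mmask8 lt = _mm_cmp_ss_mask(x, y, _CMP_LT_OS);
+ */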
+#define _mm_cmp_round_ss_mask( __X, __Y, __P, __R) __extension__ ({ \
+__builtin_ia32_cmpss_mask ((__v4sf)( __X),\
+                 (__v4sf)( __Y), __P,\
+                 (__mmask8) -1, __R);\
+})
+
+#define _mm_mask_cmp_round_ss_mask( __M, __X, __Y, __P, __R) __extension__ ({ \
+__builtin_ia32_cmpss_mask ((__v4sf)( __X),\
+                 (__v4sf)( __Y), __P,\
+                 (__mmask8)( __M), __R);\
+})
+
+#define _mm_cmp_ss_mask( __X, __Y, __P) __extension__ ({ \
+__builtin_ia32_cmpss_mask ((__v4sf)( __X),\
+                 (__v4sf)( __Y),( __P),\
+                 (__mmask8) -1,\
+                 _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_mask_cmp_ss_mask( __M, __X, __Y, __P) __extension__ ({ \
+__builtin_ia32_cmpss_mask ((__v4sf)( __X),\
+                 (__v4sf)( __Y),( __P),\
+                 (__mmask8)( __M),\
+                 _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_cmp_round_sd_mask( __X, __Y, __P,__R) __extension__ ({ \
+__builtin_ia32_cmpsd_mask ((__v2df)( __X),\
+                 (__v2df)( __Y), __P,\
+                 (__mmask8) -1, __R);\
+})
+
+#define _mm_mask_cmp_round_sd_mask( __M, __X, __Y, __P, __R) __extension__ ({ \
+__builtin_ia32_cmpsd_mask ((__v2df)( __X),\
+                 (__v2df)( __Y), __P,\
+                 (__mmask8)( __M), __R);\
+})
+
+#define _mm_cmp_sd_mask( __X, __Y, __P) __extension__ ({ \
+__builtin_ia32_cmpsd_mask ((__v2df)( __X),\
+                 (__v2df)( __Y),( __P),\
+                 (__mmask8) -1,\
+                 _MM_FROUND_CUR_DIRECTION);\
+})
+
+#define _mm_mask_cmp_sd_mask( __M, __X, __Y, __P) __extension__ ({ \
+__builtin_ia32_cmpsd_mask ((__v2df)( __X),\
+                 (__v2df)( __Y),( __P),\
+                 (__mmask8)( __M),\
+                 _MM_FROUND_CUR_DIRECTION);\
+})
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_movehdup_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+               (__v16sf)
+               _mm512_undefined_ps (),
+               (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+               (__v16sf)
+               _mm512_setzero_ps (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_moveldup_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+               (__v16sf)
+               _mm512_undefined_ps (),
+               (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+               (__v16sf)
+               _mm512_setzero_ps (),
+               (__mmask16) __U);
+}
+
+#define _mm512_shuffle_epi32( __A, __I) __extension__ ({ \
+__builtin_ia32_pshufd512_mask ((__v16si)( __A),\
+              ( __I),\
+              (__v16si) _mm512_undefined_epi32 (),\
+              (__mmask16) -1);\
+})
+
+#define _mm512_mask_shuffle_epi32( __W, __U, __A, __I) __extension__ ({ \
+__builtin_ia32_pshufd512_mask ((__v16si)( __A),\
+              ( __I),\
+              (__v16si)( __W),\
+              (__mmask16)( __U));\
+})
+
+#define _mm512_maskz_shuffle_epi32( __U, __A, __I) __extension__ ({ \
+__builtin_ia32_pshufd512_mask ((__v16si)( __A),\
+              ( __I),\
+              (__v16si) _mm512_setzero_si512 (),\
+              (__mmask16)( __U));\
+})
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+                (__v8df) _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+                (__v8di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+                (__v8di) _mm512_setzero_si512 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+              (__v8df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+              (__v8df) _mm512_setzero_pd(),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+              (__v8di) _mm512_setzero_si512(),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+                   (__v16sf) _mm512_setzero_ps(),
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+              (__v16si) _mm512_setzero_si512(),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_setzero_ps(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+                (__v16si) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+                (__v16si) _mm512_setzero_si512(),
+                (__mmask16) __U);
+}
+
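+/* Conversions with explicit rounding control.  __R is either
+ * _MM_FROUND_CUR_DIRECTION or one of the _MM_FROUND_TO_* modes or'd with
+ * _MM_FROUND_NO_EXC.  Hedged sketch:
+ *
+ *   __m512d d = _mm512_cvt_roundps_pd(s, _MM_FROUND_CUR_DIRECTION);
+ */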
+#define _mm512_cvt_roundps_pd( __A, __R) __extension__ ({ \
+__builtin_ia32_cvtps2pd512_mask ((__v8sf)( __A),\
+                (__v8df)\
+                _mm512_undefined_pd (),\
+                (__mmask8) -1,( __R));\
+})
+
+#define _mm512_mask_cvt_roundps_pd( __W, __U, __A, __R) __extension__ ({ \
+__builtin_ia32_cvtps2pd512_mask ((__v8sf)( __A),\
+                (__v8df)( __W),\
+                (__mmask8)( __U),( __R));\
+})
+
+#define _mm512_maskz_cvt_roundps_pd( __U, __A, __R) __extension__ ({ \
+__builtin_ia32_cvtps2pd512_mask ((__v8sf)( __A),\
+                (__v8df)\
+                _mm512_setzero_pd (),\
+                (__mmask8)( __U),( __R));\
+})
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtps_pd (__m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df)
+                _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
+              (__v8df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
+              (__v8df)
+              _mm512_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
+             (__v16sf) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
+             (__v16sf)
+             _mm512_setzero_ps (),
+             (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
+            (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
+            (__mmask16) __U);
+}
+
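+/* Scalar conversions with rounding control follow the same __R convention;
+ * the *_round_* forms take the mode explicitly, while the plain forms use
+ * the current MXCSR rounding mode.  Hedged sketch:
+ *
+ *   __m128 s = _mm_cvt_roundsd_ss(a, b, _MM_FROUND_TO_NEAREST_INT |
+ *                                       _MM_FROUND_NO_EXC);
+ */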
+#define _mm_cvt_roundsd_ss( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtsd2ss_round_mask ((__v4sf)( __A),\
+             (__v2df)( __B),\
+             (__v4sf) _mm_undefined_ps (),\
+             (__mmask8) -1,\
+             ( __R));\
+})
+
+#define _mm_mask_cvt_roundsd_ss( __W, __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtsd2ss_round_mask ((__v4sf)( __A),\
+             (__v2df)( __B),\
+             (__v4sf) __W,\
+             (__mmask8) __U,\
+             ( __R));\
+})
+
+#define _mm_maskz_cvt_roundsd_ss( __U, __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtsd2ss_round_mask ((__v4sf)( __A),\
+             (__v2df)( __B),\
+             (__v4sf) _mm_setzero_ps (),\
+             (__mmask8) __U,\
+             ( __R));\
+})
+
+#define _mm_cvt_roundi64_sd( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtsi2sd64 ((__v2df)( __A),( __B),( __R));\
+})
+
+#define _mm_cvt_roundsi64_sd( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtsi2sd64 ((__v2df)( __A),( __B),( __R));\
+})
+
+#define _mm_cvt_roundsi32_ss( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtsi2ss32 ((__v4sf)( __A),( __B),( __R));\
+})
+
+#define _mm_cvt_roundi32_ss( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtsi2ss32 ((__v4sf)( __A),( __B),( __R));\
+})
+
+#define _mm_cvt_roundsi64_ss( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtsi2ss64 ((__v4sf)( __A),( __B),( __R));\
+})
+
+#define _mm_cvt_roundi64_ss( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtsi2ss64 ((__v4sf)( __A),( __B),( __R));\
+})
+
+#define _mm_cvt_roundss_sd( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtss2sd_round_mask ((__v2df)( __A),\
+              (__v4sf)( __B),\
+              (__v2df) _mm_undefined_pd (),\
+              (__mmask8)-1,\
+              ( __R));\
+})
+
+#define _mm_mask_cvt_roundss_sd(__W, __U,__A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtss2sd_round_mask ((__v2df)( __A),\
+              (__v4sf)( __B),\
+              (__v2df) __W,\
+              (__mmask8) __U,\
+              ( __R));\
+})
+
+#define _mm_maskz_cvt_roundss_sd( __U,__A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtss2sd_round_mask ((__v2df)( __A),\
+              (__v4sf)( __B),\
+              (__v2df) _mm_setzero_pd(),\
+              (__mmask8) __U,\
+              ( __R));\
+})
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtu32_sd (__m128d __A, unsigned __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
+}
+
+#define _mm_cvt_roundu64_sd( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtusi2sd64 ((__v2df)( __A),( __B),( __R));\
+})
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundu32_ss( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtusi2ss32 ((__v4sf)( __A),( __B),( __R));\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtu32_ss (__m128 __A, unsigned __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundu64_ss( __A, __B, __R) __extension__ ({ \
+__builtin_ia32_cvtusi2ss64 ((__v4sf)( __A),( __B),( __R));\
+})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
+                _MM_FROUND_CUR_DIRECTION);
+}
 
 #undef __DEFAULT_FN_ATTRS
 
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512pfintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512pfintrin.h
new file mode 100644
index 0000000..54e94f1
--- /dev/null
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512pfintrin.h
@@ -0,0 +1,98 @@
+/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512PFINTRIN_H
+#define __AVX512PFINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
+
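+/* The prefetch gather/scatter macros below take a locality hint; per the
+ * Intel documentation the valid values are _MM_HINT_T0 and _MM_HINT_T1.
+ * Hedged usage sketch (base pointer and index vector are hypothetical):
+ *
+ *   _mm512_prefetch_i32scatter_ps(base, idx, 4, _MM_HINT_T0);
+ */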
+#define _mm512_mask_prefetch_i32gather_pd( index,  mask, addr,  scale, hint) __extension__ ({\
+__builtin_ia32_gatherpfdpd (mask, (__v8si) index, (long long const *) addr, scale, hint);\
+})
+
+#define _mm512_mask_prefetch_i32gather_ps( index,  mask, addr, scale,  hint) __extension__ ({\
+__builtin_ia32_gatherpfdps (mask, (__v16si) index, (int const *) addr, scale, hint);\
+})
+
+#define _mm512_mask_prefetch_i64gather_pd( index,  mask, addr,  scale, hint) __extension__ ({\
+__builtin_ia32_gatherpfqpd (mask, (__v8di) index, (long long const *) addr, scale, hint);\
+})
+
+#define _mm512_mask_prefetch_i64gather_ps( index,  mask, addr, scale,  hint) __extension__ ({\
+__builtin_ia32_gatherpfqps (mask, (__v8di) index, (int const *) addr, scale, hint);\
+})
+
+#define _mm512_prefetch_i32scatter_pd(addr,  index,  scale,  hint) __extension__ ({\
+__builtin_ia32_scatterpfdpd ((__mmask8) -1, (__v8si) index, \
+                            (void  *)addr, scale, hint);\
+})
+
+#define _mm512_mask_prefetch_i32scatter_pd(addr,  mask,  index,  scale,  hint) __extension__ ({\
+__builtin_ia32_scatterpfdpd (mask, (__v8si) index, (void  *) addr,\
+                             scale, hint);\
+})
+
+#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
+__builtin_ia32_scatterpfdps ((__mmask16) -1, (__v16si) index, (void  *) addr,\
+                             scale, hint);\
+})
+
+#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+__builtin_ia32_scatterpfdps (mask, (__v16si) index, (void  *) addr,\
+                             scale, hint);\
+})
+
+#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
+__builtin_ia32_scatterpfqpd ((__mmask8) -1, (__v8di) index, (void  *) addr,\
+                             scale, hint);\
+})
+
+#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+__builtin_ia32_scatterpfqpd (mask, (__v8di) index, (void  *) addr,\
+                             scale, hint);\
+})
+
+#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
+__builtin_ia32_scatterpfqps ((__mmask8) -1, (__v8di) index, (void  *) addr,\
+                             scale, hint);\
+})
+
+#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+__builtin_ia32_scatterpfqps (mask, (__v8di) index, (void  *) addr,\
+                             scale, hint);\
+})
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vbmiintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vbmiintrin.h
index 7ac9fe7..a2c3b8f 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vbmiintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vbmiintrin.h
@@ -79,6 +79,35 @@
                __U);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_undefined_epi32 (),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_setzero_si512(),
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) __W,
+                 (__mmask64) __M);
+}
+
 #undef __DEFAULT_FN_ATTRS
 
 #endif
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vbmivlintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vbmivlintrin.h
index 04b7a47..5798969 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vbmivlintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vbmivlintrin.h
@@ -126,6 +126,62 @@
                __U);
 }
 
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_undefined_si128 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_setzero_si128 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) __W,
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_undefined_si256 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_setzero_si256 (),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) __W,
+                 (__mmask32) __M);
+}
 
 #undef __DEFAULT_FN_ATTRS
 
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlbwintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlbwintrin.h
index bb69b1b..d8e67fc 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlbwintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlbwintrin.h
@@ -2847,9 +2847,6 @@
               (__mmask16)( __U));\
 })
 
-
-
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
 {
@@ -3020,6 +3017,447 @@
                  _mm256_setzero_si256 (),
                  (__mmask32) __U);
 }
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedquhi128_mask ((__v8hi *) __P,
+             (__v8hi) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
+{
+  __builtin_ia32_storedquhi256_mask ((__v16hi *) __P,
+             (__v16hi) __A,
+             (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
+{
+  __builtin_ia32_storedquqi128_mask ((__v16qi *) __P,
+             (__v16qi) __A,
+             (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
+{
+  __builtin_ia32_storedquqi256_mask ((__v32qi *) __P,
+             (__v32qi) __A,
+             (__mmask32) __U);
+}
+
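+/* VPTESTM/VPTESTNM turn each lane into a mask bit: test sets the bit when
+ * (a & b) is non-zero in that lane, testn when it is zero.  Hedged sketch
+ * (the input name is hypothetical):
+ *
+ *   __mmask16 nz = _mm_test_epi8_mask(v, v);  // bit i set iff v[i] != 0
+ */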
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_test_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+            (__v16qi) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+            (__v16qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_test_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+            (__v32qi) __B,
+            (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+            (__v32qi) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                 (__v8hi) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                 (__v8hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_test_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+            (__v16hi) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+            (__v16hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_testn_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+             (__v16qi) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+             (__v16qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+             (__v32qi) __B,
+             (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+             (__v32qi) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+            (__v8hi) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+            (__v8hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+             (__v16hi) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+             (__v16hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_movepi8_mask (__m128i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_movepi8_mask (__m256i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi16_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_movepi16_mask (__m256i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi8 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi8 (__mmask32 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi16 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi16 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A,
+                   (__v16qi) __O,
+                   __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A,
+                   (__v16qi) _mm_setzero_si128 (),
+                   __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A,
+                   (__v32qi) __O,
+                   __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A,
+                   (__v32qi) _mm256_setzero_si256 (),
+                   __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A,
+                   (__v8hi) __O,
+                   __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A,
+                   (__v8hi) _mm_setzero_si128 (),
+                   __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A,
+                   (__v16hi) __O,
+                   __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A,
+                   (__v16hi) _mm256_setzero_si256 (),
+                   __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                 (__v16hi) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                 (__v16hi) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                 (__v8hi) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                 (__v8hi) _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) _mm_undefined_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) _mm_setzero_si128 (),
+                 (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) __W,
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) _mm256_undefined_si256 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) _mm256_setzero_si256 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) __W,
+                 (__mmask16) __M);
+}
+
+#define _mm_mask_alignr_epi8( __W, __U, __A, __B, __N) __extension__ ({ \
+__builtin_ia32_palignr128_mask ((__v2di)( __A),\
+               (__v2di)( __B),\
+               ( __N) * 8,\
+               (__v2di)( __W),\
+               (__mmask16)( __U));\
+})
+
+#define _mm_maskz_alignr_epi8( __U, __A, __B, __N) __extension__ ({ \
+__builtin_ia32_palignr128_mask ((__v2di)( __A),\
+               (__v2di)( __B),\
+               ( __N) * 8,\
+               (__v2di)\
+               _mm_setzero_si128 (),\
+               (__mmask16)( __U));\
+})
+
+#define _mm256_mask_alignr_epi8( __W, __U, __A, __B, __N) __extension__ ({ \
+__builtin_ia32_palignr256_mask ((__v4di)( __A),\
+               (__v4di)( __B),\
+               ( __N) * 8,\
+               (__v4di)( __W),\
+               (__mmask32)( __U));\
+})
+
+#define _mm256_maskz_alignr_epi8( __U, __A, __B, __N) __extension__ ({ \
+__builtin_ia32_palignr256_mask ((__v4di)( __A),\
+               (__v4di)( __B),\
+               ( __N) * 8,\
+               (__v4di)\
+               _mm256_setzero_si256 (),\
+               (__mmask32)( __U));\
+})
+
+#define _mm_dbsad_epu8( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_dbpsadbw128_mask ((__v16qi)( __A),\
+                (__v16qi)( __B),\
+                ( __imm),\
+                (__v8hi) _mm_setzero_hi (),\
+                (__mmask8) -1);\
+})
+
+#define _mm_mask_dbsad_epu8( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_dbpsadbw128_mask ((__v16qi)( __A),\
+                (__v16qi)( __B),\
+                ( __imm),\
+                (__v8hi)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm_maskz_dbsad_epu8( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_dbpsadbw128_mask ((__v16qi)( __A),\
+                (__v16qi)( __B),\
+                ( __imm),\
+                (__v8hi) _mm_setzero_si128 (),\
+                (__mmask8)( __U));\
+})
+
+#define _mm256_dbsad_epu8( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_dbpsadbw256_mask ((__v32qi)( __A),\
+                (__v32qi)( __B),\
+                ( __imm),\
+                (__v16hi) _mm256_setzero_si256 (),\
+                (__mmask16) -1);\
+})
+
+#define _mm256_mask_dbsad_epu8( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_dbpsadbw256_mask ((__v32qi)( __A),\
+                (__v32qi)( __B),\
+                ( __imm),\
+                (__v16hi)( __W),\
+                (__mmask16)( __U));\
+})
+
+#define _mm256_maskz_dbsad_epu8( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_dbpsadbw256_mask ((__v32qi)( __A),\
+                (__v32qi)( __B),\
+                ( __imm),\
+                (__v16hi) _mm256_setzero_si256 (),\
+                (__mmask16)( __U));\
+})
+
 #undef __DEFAULT_FN_ATTRS
 
 #endif /* __AVX512VLBWINTRIN_H */
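
For orientation, a minimal usage sketch of the masked byte/word intrinsics this hunk adds (mask tests, mask<->vector moves, masked unaligned stores). This is an illustrative sketch only, assuming a target built with -mavx512vl -mavx512bw; the function and variable names (sketch_bw, dst, v, live, ones) are not part of the header.

#include <immintrin.h>

/* compile with: clang -mavx512vl -mavx512bw */
void sketch_bw(short *dst, __m128i v)
{
  __mmask8 live = _mm_test_epi16_mask(v, v);  /* bit i set iff word i of v is nonzero */
  __m128i  ones = _mm_movm_epi16(live);       /* mask -> 0xFFFF in the selected words */
  _mm_mask_storeu_epi16(dst, live, ones);     /* unaligned store of the selected words only */
}
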
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlcdintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlcdintrin.h
new file mode 100644
index 0000000..b372da2
--- /dev/null
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlcdintrin.h
@@ -0,0 +1,159 @@
+/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLCDINTRIN_H
+#define __AVX512VLCDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di)
+               _mm_setzero_di (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di)  _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLCDINTRIN_H */
\ No newline at end of file
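
As the #error guard in the new file requires, these intrinsics are reached through <immintrin.h>. A minimal sketch of the conflict-detection and mask-broadcast intrinsics defined above, assuming a target built with -mavx512vl -mavx512cd; the names (sketch_cd, idx, m, conf, bcst) are illustrative.

#include <immintrin.h>

/* compile with: clang -mavx512vl -mavx512cd */
__m128i sketch_cd(__m128i idx, __mmask16 m)
{
  __m128i conf = _mm_conflict_epi32(idx);   /* lane i: bitmask of earlier lanes equal to lane i */
  __m128i bcst = _mm_broadcastmw_epi32(m);  /* mask register value copied into every 32-bit lane */
  return _mm_and_si128(conf, bcst);         /* plain SSE2 AND of the two results */
}
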
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vldqintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vldqintrin.h
index dfd858e..3c28df8 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vldqintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vldqintrin.h
@@ -948,6 +948,308 @@
   (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
                 (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
 
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi32_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_movepi32_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi32 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi32 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi64_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_movepi64_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_f32x2 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                (__v8sf)_mm256_undefined_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                (__v8sf) __O,
+                __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                (__v8sf) _mm256_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_f64x2 (__m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
+                 (__v4df)_mm256_undefined_pd(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
+                 (__v4df) __O,
+                 __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
+                 (__v4df) _mm256_setzero_pd (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcast_i32x2 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
+                 (__v4si)_mm_undefined_si128(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
+                 (__v4si) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
+                 (__v4si) _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i32x2 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
+                 (__v8si)_mm256_undefined_si256(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
+                 (__v8si) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i64x2 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
+                 (__v4di)_mm256_undefined_si256(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
+                 (__v4di) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
+                 (__v4di) _mm256_setzero_si256 (),
+                 __M);
+}
+
+#define _mm256_extractf64x2_pd( __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf64x2_256_mask ((__v4df)( __A),\
+               ( __imm),\
+               (__v2df) _mm_setzero_pd (),\
+               (__mmask8) -1);\
+})
+
+#define _mm256_mask_extractf64x2_pd( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf64x2_256_mask ((__v4df)( __A),\
+               ( __imm),\
+               (__v2df)( __W),\
+               (__mmask8) ( __U));\
+})
+
+#define _mm256_maskz_extractf64x2_pd( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf64x2_256_mask ((__v4df)( __A),\
+               ( __imm),\
+               (__v2df) _mm_setzero_pd (),\
+               (__mmask8) ( __U));\
+})
+
+#define _mm256_extracti64x2_epi64( __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti64x2_256_mask ((__v4di)( __A),\
+               ( __imm),\
+               (__v2di) _mm_setzero_di (),\
+               (__mmask8) -1);\
+})
+
+#define _mm256_mask_extracti64x2_epi64( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti64x2_256_mask ((__v4di)( __A),\
+               ( __imm),\
+               (__v2di)( __W),\
+               (__mmask8) ( __U));\
+})
+
+#define _mm256_maskz_extracti64x2_epi64( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti64x2_256_mask ((__v4di)( __A),\
+               ( __imm),\
+               (__v2di) _mm_setzero_di (),\
+               (__mmask8) ( __U));\
+})
+
+#define _mm256_insertf64x2( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf64x2_256_mask ((__v4df)( __A),\
+              (__v2df)( __B),\
+              ( __imm),\
+              (__v4df) _mm256_setzero_pd (),\
+              (__mmask8) -1);\
+})
+
+#define _mm256_mask_insertf64x2( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf64x2_256_mask ((__v4df)( __A),\
+              (__v2df)( __B),\
+              ( __imm),\
+              (__v4df)( __W),\
+              (__mmask8) ( __U));\
+})
+
+#define _mm256_maskz_insertf64x2( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf64x2_256_mask ((__v4df)( __A),\
+              (__v2df)( __B),\
+              ( __imm),\
+              (__v4df) _mm256_setzero_pd (),\
+              (__mmask8) ( __U));\
+})
+
+#define _mm256_inserti64x2( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti64x2_256_mask ((__v4di)( __A),\
+              (__v2di)( __B),\
+              ( __imm),\
+              (__v4di) _mm256_setzero_si256 (),\
+              (__mmask8) -1);\
+})
+
+#define _mm256_mask_inserti64x2( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti64x2_256_mask ((__v4di)( __A),\
+              (__v2di)( __B),\
+              ( __imm),\
+              (__v4di)( __W),\
+              (__mmask8) ( __U));\
+})
+
+#define _mm256_maskz_inserti64x2( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti64x2_256_mask ((__v4di)( __A),\
+              (__v2di)( __B),\
+              ( __imm),\
+              (__v4di) _mm256_setzero_si256 (),\
+              (__mmask8) ( __U));\
+})
+
+#define _mm_mask_fpclass_pd_mask( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclasspd128_mask ((__v2df)( __A),\
+                 ( __imm),( __U));\
+})
+
+#define _mm_fpclass_pd_mask( __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclasspd128_mask ((__v2df)( __A),\
+                  ( __imm),\
+                  (__mmask8) -1);\
+})
+
+#define _mm256_mask_fpclass_pd_mask( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclasspd256_mask ((__v4df)( __A),\
+                  ( __imm),( __U));\
+})
+
+#define _mm256_fpclass_pd_mask( __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclasspd256_mask ((__v4df)( __A),\
+                  ( __imm),\
+                  (__mmask8) -1);\
+})
+
+#define _mm_mask_fpclass_ps_mask( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclassps128_mask ((__v4sf)( __A),\
+                  ( __imm),( __U));\
+})
+
+#define _mm_fpclass_ps_mask( __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclassps128_mask ((__v4sf)( __A),\
+                  ( __imm),\
+                  (__mmask8) -1);\
+})
+
+#define _mm256_mask_fpclass_ps_mask( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclassps256_mask ((__v8sf)( __A),\
+                  ( __imm),( __U));\
+})
+
+#define _mm256_fpclass_ps_mask( __A, __imm) __extension__ ({ \
+__builtin_ia32_fpclassps256_mask ((__v8sf)( __A),\
+                  ( __imm),\
+                  (__mmask8) -1);\
+})
+
 #undef __DEFAULT_FN_ATTRS
 
 #endif
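
A minimal sketch of the mask<->vector moves and fpclass classifiers this hunk adds, assuming a target built with -mavx512vl -mavx512dq; the name sketch_dq and the 0x81 immediate choice (QNaN | SNaN, i.e. "any NaN") are illustrative, not part of the header.

#include <immintrin.h>

/* compile with: clang -mavx512vl -mavx512dq */
__mmask8 sketch_dq(__m256d v)
{
  __mmask8 nan   = _mm256_fpclass_pd_mask(v, 0x81);  /* mask bit per lane that is a NaN */
  __m256i  lanes = _mm256_movm_epi64(nan);           /* mask -> all-ones/all-zeros 64-bit lanes */
  return _mm256_movepi64_mask(lanes);                /* sign bit of each lane -> mask bit */
}
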
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlintrin.h
index 6080a6d..dc6e0ba 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avx512vlintrin.h
@@ -5548,14 +5548,6 @@
              (__mmask8) __U);
 }
 
-#define _mm256_maskz_sllv_epi32( __U, __X, __Y) __extension__ ({ \
-__builtin_ia32_psllv8si_mask ((__v8si)( __X),\
-             (__v8si)( __Y),\
-             (__v8si)\
-             _mm256_setzero_si256 (),\
-             (__mmask8)( __U));\
-})
-
 
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -6368,6 +6360,3159 @@
               _mm256_setzero_ps (),
               (__mmask8) __U);
 }
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storeapd128_mask ((__v2df *) __P,
+           (__v2df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+  __builtin_ia32_storeapd256_mask ((__v4df *) __P,
+           (__v4df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
+           (__v4sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+  __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
+           (__v8sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedqudi128_mask ((__v2di *) __P,
+             (__v2di) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_storedqudi256_mask ((__v4di *) __P,
+             (__v4di) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedqusi128_mask ((__v4si *) __P,
+             (__v4si) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_storedqusi256_mask ((__v8si *) __P,
+             (__v8si) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storeupd128_mask ((__v2df *) __P,
+           (__v2df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+  __builtin_ia32_storeupd256_mask ((__v4df *) __P,
+           (__v4df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storeups128_mask ((__v4sf *) __P,
+           (__v4sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+  __builtin_ia32_storeups256_mask ((__v8sf *) __P,
+           (__v8sf) __A,
+           (__mmask8) __U);
+}
+
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A,
+          __m128d __B)
+{
+  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A,
+       __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A,
+       __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A,
+          __m128d __B)
+{
+  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A,
+       __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A,
+       __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rcp14_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_rcp14_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp14_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_rcp14_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+
+#define _mm_mask_permute_pd( __W, __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilpd_mask ((__v2df)( __X),( __C),\
+              (__v2df)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm_maskz_permute_pd( __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilpd_mask ((__v2df)( __X),( __C),\
+              (__v2df)\
+              _mm_setzero_pd (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_mask_permute_pd( __W, __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilpd256_mask ((__v4df)( __X),( __C),\
+                 (__v4df)( __W),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_permute_pd( __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilpd256_mask ((__v4df)( __X),( __C),\
+                 (__v4df)\
+                 _mm256_setzero_pd (),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm_mask_permute_ps( __W, __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilps_mask ((__v4sf)( __X),( __C),\
+             (__v4sf)( __W),\
+             (__mmask8)( __U));\
+})
+
+#define _mm_maskz_permute_ps( __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilps_mask ((__v4sf)( __X),( __C),\
+             (__v4sf)\
+             _mm_setzero_ps (),\
+             (__mmask8)( __U));\
+})
+
+#define _mm256_mask_permute_ps( __W, __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilps256_mask ((__v8sf)( __X),( __C),\
+                (__v8sf)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_permute_ps( __U, __X, __C) __extension__ ({ \
+__builtin_ia32_vpermilps256_mask ((__v8sf)( __X),( __C),\
+                (__v8sf)\
+                _mm256_setzero_ps (),\
+                (__mmask8)( __U));\
+})
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
+      __m128i __C)
+{
+  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+                 (__v2di) __C,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
+{
+  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+                 (__v2di) __C,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
+         __m256i __C)
+{
+  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
+              (__v4di) __C,
+              (__v4df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
+{
+  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
+              (__v4di) __C,
+              (__v4df)
+              _mm256_setzero_pd (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
+      __m128i __C)
+{
+  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
+                (__v4si) __C,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
+{
+  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
+                (__v4si) __C,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
+         __m256i __C)
+{
+  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
+                   (__v8si) __C,
+                   (__v8sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
+{
+  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
+                   (__v8si) __C,
+                   (__v8sf)
+                   _mm256_setzero_ps (),
+                   (__mmask8) __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+                 (__v4si) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+                 (__v4si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_test_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+                 (__v8si) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+                 (__v8si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+                 (__v2di) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+                 (__v2di) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_test_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+                 (__v4di) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+                 (__v4di) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+            (__v4si) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+            (__v4si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_testn_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+            (__v8si) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+            (__v8si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+            (__v2di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+            (__v2di) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_testn_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+            (__v4di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+            (__v4di) __B, __U);
+}
+
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A,
+                 (__v4si) __B,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A,
+                 (__v4si) __B,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A,
+                 (__v8si) __B,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A,
+                 (__v8si) __B,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A,
+                  (__v2di) __B,
+                  (__v2di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A,
+                  (__v2di) __B,
+                  (__v2di)
+                  _mm_setzero_di (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A,
+                  (__v4di) __B,
+                  (__v4di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A,
+                  (__v4di) __B,
+                  (__v4di)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A,
+                 (__v4si) __B,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A,
+                 (__v4si) __B,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A,
+                 (__v8si) __B,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A,
+                 (__v8si) __B,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A,
+                  (__v2di) __B,
+                  (__v2di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A,
+                  (__v2di) __B,
+                  (__v2di)
+                  _mm_setzero_di (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A,
+                  (__v4di) __B,
+                  (__v4di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A,
+                  (__v4di) __B,
+                  (__v4di)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
+             (__v4si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
+             (__v4si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+#define _mm_mask_srai_epi32( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_psradi128_mask ((__v4si)( __A),( __imm),\
+              (__v4si)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm_maskz_srai_epi32( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_psradi128_mask ((__v4si)( __A),( __imm),\
+              (__v4si)\
+              _mm_setzero_si128 (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_mask_srai_epi32( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_psradi256_mask ((__v8si)( __A),( __imm),\
+              (__v8si)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_srai_epi32( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_psradi256_mask ((__v8si)( __A),( __imm),\
+              (__v8si)\
+              _mm256_setzero_si256 (),\
+              (__mmask8)( __U));\
+})
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_di (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_di (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+             (__v2di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+             (__v2di) __B,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+             (__v2di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+#define _mm_srai_epi64( __A, __imm) __extension__ ({ \
+__builtin_ia32_psraqi128_mask ((__v2di)( __A),( __imm),\
+              (__v2di)\
+              _mm_setzero_di (),\
+              (__mmask8) -1);\
+})
+
+#define _mm_mask_srai_epi64( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_psraqi128_mask ((__v2di)( __A),( __imm),\
+              (__v2di)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm_maskz_srai_epi64( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_psraqi128_mask ((__v2di)( __A),( __imm),\
+              (__v2di)\
+              _mm_setzero_si128 (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_srai_epi64( __A, __imm) __extension__ ({ \
+__builtin_ia32_psraqi256_mask ((__v4di)( __A),( __imm),\
+              (__v4di)\
+              _mm256_setzero_si256 (),\
+              (__mmask8) -1);\
+})
+
+#define _mm256_mask_srai_epi64( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_psraqi256_mask ((__v4di)( __A),( __imm),\
+              (__v4di)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_srai_epi64( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_psraqi256_mask ((__v4di)( __A),( __imm),\
+              (__v4di)\
+              _mm256_setzero_si256 (),\
+              (__mmask8)( __U));\
+})
+
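+/* Ternary logic: each result bit is computed by looking up the matching
+   bits of __A, __B and __C in the 8-bit truth table imm.  For example,
+   imm == 0xCA yields the bitwise select (__A ? __B : __C). */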
+#define _mm_ternarylogic_epi32( __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogd128_mask ((__v4si)( __A),\
+                 (__v4si)( __B),\
+                 (__v4si)( __C),( imm),\
+                 (__mmask8) -1);\
+})
+
+#define _mm_mask_ternarylogic_epi32( __A, __U, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogd128_mask ((__v4si)( __A),\
+                 (__v4si)( __B),\
+                 (__v4si)( __C),( imm),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm_maskz_ternarylogic_epi32( __U, __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogd128_maskz ((__v4si)( __A),\
+                  (__v4si)( __B),\
+                  (__v4si)( __C),\
+                 ( imm),\
+                  (__mmask8)( __U));\
+})
+
+#define _mm256_ternarylogic_epi32( __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogd256_mask ((__v8si)( __A),\
+                 (__v8si)( __B),\
+                 (__v8si)( __C),( imm),\
+                 (__mmask8) -1);\
+})
+
+#define _mm256_mask_ternarylogic_epi32( __A, __U, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogd256_mask ((__v8si)( __A),\
+                 (__v8si)( __B),\
+                 (__v8si)( __C),( imm),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_ternarylogic_epi32( __U, __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogd256_maskz ((__v8si)( __A),\
+                  (__v8si)( __B),\
+                  (__v8si)( __C),\
+                 ( imm),\
+                  (__mmask8)( __U));\
+})
+
+#define _mm_ternarylogic_epi64( __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogq128_mask ((__v2di)( __A),\
+                 (__v2di)( __B),\
+                 (__v2di)( __C),( imm),\
+                 (__mmask8) -1);\
+})
+
+#define _mm_mask_ternarylogic_epi64( __A, __U, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogq128_mask ((__v2di)( __A),\
+                 (__v2di)( __B),\
+                 (__v2di)( __C),( imm),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm_maskz_ternarylogic_epi64( __U, __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogq128_maskz ((__v2di)( __A),\
+                  (__v2di)( __B),\
+                  (__v2di)( __C),\
+                 ( imm),\
+                  (__mmask8)( __U));\
+})
+
+#define _mm256_ternarylogic_epi64( __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogq256_mask ((__v4di)( __A),\
+                 (__v4di)( __B),\
+                 (__v4di)( __C),( imm),\
+                 (__mmask8) -1);\
+})
+
+#define _mm256_mask_ternarylogic_epi64( __A, __U, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogq256_mask ((__v4di)( __A),\
+                 (__v4di)( __B),\
+                 (__v4di)( __C),( imm),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_ternarylogic_epi64( __U, __A, __B, __C, imm) __extension__ ({ \
+__builtin_ia32_pternlogq256_maskz ((__v4di)( __A),\
+                  (__v4di)( __B),\
+                  (__v4di)( __C),\
+                 ( imm),\
+                  (__mmask8)( __U));\
+})
+
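+/* 128-bit lane shuffles: the low 128 bits of the result come from one of
+   __A's two lanes and the high 128 bits from one of __B's, as selected
+   by __imm. */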
+#define _mm256_shuffle_f32x4( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f32x4_256_mask ((__v8sf)( __A),\
+                  (__v8sf)( __B),\
+                 ( __imm),\
+                  (__v8sf)\
+                  _mm256_setzero_ps (),\
+                  (__mmask8) -1);\
+})
+
+#define _mm256_mask_shuffle_f32x4( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f32x4_256_mask ((__v8sf)( __A),\
+                  (__v8sf)( __B),\
+                 ( __imm),\
+                  (__v8sf)( __W),\
+                  (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_shuffle_f32x4( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f32x4_256_mask ((__v8sf)( __A),\
+                  (__v8sf)( __B),\
+                 ( __imm),\
+                  (__v8sf)\
+                  _mm256_setzero_ps (),\
+                  (__mmask8)( __U));\
+})
+
+#define _mm256_shuffle_f64x2( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f64x2_256_mask ((__v4df)( __A),\
+                   (__v4df)( __B),\
+                  ( __imm),\
+                   (__v4df)\
+                   _mm256_setzero_pd (),\
+                   (__mmask8) -1);\
+})
+
+#define _mm256_mask_shuffle_f64x2( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f64x2_256_mask ((__v4df)( __A),\
+                   (__v4df)( __B),\
+                  ( __imm),\
+                   (__v4df)( __W),\
+                   (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_shuffle_f64x2( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_f64x2_256_mask ((__v4df)( __A),\
+                   (__v4df)( __B),\
+                  ( __imm),\
+                   (__v4df)\
+                   _mm256_setzero_pd (),\
+                   (__mmask8)( __U));\
+})
+
+#define _mm256_shuffle_i32x4( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i32x4_256_mask ((__v8si)( __A),\
+                   (__v8si)( __B),\
+                  ( __imm),\
+                   (__v8si)\
+                   _mm256_setzero_si256 (),\
+                   (__mmask8) -1);\
+})
+
+#define _mm256_mask_shuffle_i32x4( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i32x4_256_mask ((__v8si)( __A),\
+                   (__v8si)( __B),\
+                  ( __imm),\
+                   (__v8si)( __W),\
+                   (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_shuffle_i32x4( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i32x4_256_mask ((__v8si)( __A),\
+                   (__v8si)( __B),\
+                  ( __imm),\
+                   (__v8si)\
+                   _mm256_setzero_si256 (),\
+                   (__mmask8)( __U));\
+})
+
+#define _mm256_shuffle_i64x2( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i64x2_256_mask ((__v4di)( __A),\
+                   (__v4di)( __B),\
+                  ( __imm),\
+                   (__v4di)\
+                   _mm256_setzero_si256 (),\
+                   (__mmask8) -1);\
+})
+
+#define _mm256_mask_shuffle_i64x2( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i64x2_256_mask ((__v4di)( __A),\
+                   (__v4di)( __B),\
+                  ( __imm),\
+                   (__v4di)( __W),\
+                   (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_shuffle_i64x2( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shuf_i64x2_256_mask ((__v4di)( __A),\
+                   (__v4di)( __B),\
+                  ( __imm),\
+                   (__v4di)\
+                   _mm256_setzero_si256 (),\
+                   (__mmask8)( __U));\
+})
+
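+/* Masked forms of the classic SHUFPD/SHUFPS element shuffles. */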
+#define _mm_mask_shuffle_pd( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shufpd128_mask ((__v2df)( __A),\
+              (__v2df)( __B),( __imm),\
+              (__v2df)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm_maskz_shuffle_pd( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shufpd128_mask ((__v2df)( __A),\
+              (__v2df)( __B),( __imm),\
+              (__v2df)\
+              _mm_setzero_pd (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_mask_shuffle_pd( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shufpd256_mask ((__v4df)( __A),\
+              (__v4df)( __B),( __imm),\
+              (__v4df)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_shuffle_pd( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shufpd256_mask ((__v4df)( __A),\
+              (__v4df)( __B),( __imm),\
+              (__v4df)\
+              _mm256_setzero_pd (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm_mask_shuffle_ps( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shufps128_mask ((__v4sf)( __A),\
+             (__v4sf)( __B),( __imm),\
+             (__v4sf)( __W),\
+             (__mmask8)( __U));\
+})
+
+#define _mm_maskz_shuffle_ps( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shufps128_mask ((__v4sf)( __A),\
+             (__v4sf)( __B),( __imm),\
+             (__v4sf)\
+             _mm_setzero_ps (),\
+             (__mmask8)( __U));\
+})
+
+#define _mm256_mask_shuffle_ps( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shufps256_mask ((__v8sf)( __A),\
+             (__v8sf)( __B),( __imm),\
+             (__v8sf)( __W),\
+             (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_shuffle_ps( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_shufps256_mask ((__v8sf)( __A),\
+             (__v8sf)( __B),( __imm),\
+             (__v8sf)\
+             _mm256_setzero_ps (),\
+             (__mmask8)( __U));\
+})
+
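+/* Approximate reciprocal square roots (VRSQRT14PD/VRSQRT14PS), with a
+   maximum relative error of 2^-14. */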
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rsqrt14_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_rsqrt14_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt14_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_rsqrt14_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
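+/* Broadcasts: replicate a 128-bit block, or a single element, across all
+   lanes of the destination, with merge and zero masking. */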
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_f32x4 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                (__v8sf)_mm256_undefined_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                (__v8sf) __O,
+                __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                (__v8sf) _mm256_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i32x4 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
+                 (__v8si)_mm256_undefined_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
+                 (__v8si) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A,
+                   (__v4df) __O,
+                   __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A,
+                   (__v4df) _mm256_setzero_pd (),
+                   __M);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A,
+                  (__v4sf) __O,
+                  __M);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+  return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A,
+                  (__v4sf) _mm_setzero_ps (),
+                  __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A,
+                  (__v8sf) __O,
+                  __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A,
+                  (__v8sf) _mm256_setzero_ps (),
+                  __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A,
+                   (__v4si) __O,
+                   __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A,
+                   (__v4si) _mm_setzero_si128 (),
+                   __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A,
+                   (__v8si) __O,
+                   __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A,
+                   (__v8si) _mm256_setzero_si256 (),
+                   __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A,
+                   (__v2di) __O,
+                   __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A,
+                   (__v2di) _mm_setzero_si128 (),
+                   __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A,
+                   (__v4di) __O,
+                   __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A,
+                   (__v4di) _mm256_setzero_si256 (),
+                   __M);
+}
+
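+/* Down-conversions with signed saturation (VPMOVS*): each element is
+   narrowed and clamped to the signed range of the destination type.  The
+   _storeu_ forms write the narrowed elements straight to unaligned memory
+   for the lanes selected by __M. */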
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+               (__v8hi)_mm_setzero_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+               (__v8hi)__O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+               (__v8hi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+               (__v4si)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+               (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+               (__v4si) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+               (__v4si)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+               (__v4si)__O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+               (__v4si) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+               (__v8hi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+               (__v8hi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
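+/* Down-conversions with unsigned saturation (VPMOVUS*): source elements
+   are treated as unsigned and clamped to the unsigned range of the
+   destination type. */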
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                (__v8hi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                (__v8hi) _mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                (__v4si)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                (__v4si) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                (__v4si)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                (__v4si) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                (__v8hi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                (__v8hi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
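+/* Down-conversions by truncation (VPMOV*): keep only the low bits of each
+   element, with no saturation. */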
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+              (__v16qi)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+              (__v16qi)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+              (__v16qi)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+              (__v8hi)_mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+              (__v16qi) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+              (__v16qi) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+              (__v4si)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+              (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+              (__v4si) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+              (__v4si) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+              (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+              (__v4si) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+              (__v8hi) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+              (__v8hi)__O,
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+              (__v8hi)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
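+/* Extract or insert a 128-bit block at the lane position selected by
+   __imm. */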
+#define _mm256_extractf32x4_ps( __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf32x4_256_mask ((__v8sf)( __A),\
+              ( __imm),\
+              (__v4sf) _mm_setzero_ps (),\
+              (__mmask8) -1);\
+})
+
+#define _mm256_mask_extractf32x4_ps( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf32x4_256_mask ((__v8sf)( __A),\
+                ( __imm),\
+                (__v4sf)( __W),\
+                (__mmask8) ( __U));\
+})
+
+#define _mm256_maskz_extractf32x4_ps( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extractf32x4_256_mask ((__v8sf)( __A),\
+                ( __imm),\
+                (__v4sf) _mm_setzero_ps (),\
+                (__mmask8) ( __U));\
+})
+
+#define _mm256_extracti32x4_epi32( __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti32x4_256_mask ((__v8si)( __A),\
+                ( __imm),\
+                (__v4si) _mm_setzero_si128 (),\
+                (__mmask8) -1);\
+})
+
+#define _mm256_mask_extracti32x4_epi32( __W, __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti32x4_256_mask ((__v8si)( __A),\
+                ( __imm),\
+                (__v4si)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_extracti32x4_epi32( __U, __A, __imm) __extension__ ({ \
+__builtin_ia32_extracti32x4_256_mask ((__v8si)( __A),\
+               ( __imm),\
+               (__v4si) _mm_setzero_si128 (),\
+               (__mmask8) ( __U));\
+})
+
+#define _mm256_insertf32x4( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf32x4_256_mask ((__v8sf)( __A),\
+                (__v4sf)( __B),\
+                ( __imm),\
+                (__v8sf) _mm256_setzero_ps (),\
+                (__mmask8) -1);\
+})
+
+#define _mm256_mask_insertf32x4( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf32x4_256_mask ((__v8sf)( __A),\
+                (__v4sf)( __B),\
+                ( __imm),\
+                (__v8sf)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_insertf32x4( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_insertf32x4_256_mask ((__v8sf)( __A),\
+                (__v4sf)( __B),\
+                ( __imm),\
+                (__v8sf) _mm256_setzero_ps (),\
+                (__mmask8)( __U));\
+})
+
+#define _mm256_inserti32x4( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti32x4_256_mask ((__v8si)( __A),\
+                (__v4si)( __B),\
+                ( __imm),\
+                (__v8si) _mm256_setzero_si256 (),\
+                (__mmask8) -1);\
+})
+
+#define _mm256_mask_inserti32x4( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti32x4_256_mask ((__v8si)( __A),\
+                (__v4si)( __B),\
+                ( __imm),\
+                (__v8si)( __W),\
+                (__mmask8) ( __U));\
+})
+
+#define _mm256_maskz_inserti32x4( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_inserti32x4_256_mask ((__v8si)( __A),\
+                (__v4si)( __B),\
+                ( __imm),\
+                (__v8si) _mm256_setzero_si256 (),\
+                (__mmask8) ( __U));\
+})
+
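+/* getmant: extract the normalized mantissa of each element.  __B selects
+   the normalization interval and __C the sign control; the builtin takes
+   them packed as (__C << 2) | __B. */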
+#define _mm_getmant_pd( __A, __B, __C) __extension__({\
+__builtin_ia32_getmantpd128_mask ((__v2df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v2df) _mm_setzero_pd (),\
+                 (__mmask8) -1);\
+})
+
+#define _mm_mask_getmant_pd( __W, __U, __A, __B, __C) __extension__({\
+__builtin_ia32_getmantpd128_mask ((__v2df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v2df)( __W),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm_maskz_getmant_pd( __U, __A, __B, __C) __extension__({\
+__builtin_ia32_getmantpd128_mask ((__v2df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v2df) _mm_setzero_pd (),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm256_getmant_pd( __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantpd256_mask ((__v4df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v4df) _mm256_setzero_pd (),\
+                 (__mmask8) -1);\
+})
+
+#define _mm256_mask_getmant_pd( __W, __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantpd256_mask ((__v4df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v4df)( __W),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_getmant_pd( __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantpd256_mask ((__v4df)( __A),\
+                 (__C << 2) |( __B),\
+                 (__v4df) _mm256_setzero_pd (),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm_getmant_ps( __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantps128_mask ((__v4sf)( __A),\
+                (__C << 2) |( __B),\
+                (__v4sf) _mm_setzero_ps (),\
+                (__mmask8) -1);\
+})
+
+#define _mm_mask_getmant_ps( __W, __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantps128_mask ((__v4sf)( __A),\
+                (__C << 2) |( __B),\
+                (__v4sf)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm_maskz_getmant_ps( __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantps128_mask ((__v4sf)( __A),\
+                (__C << 2) |( __B),\
+                (__v4sf) _mm_setzero_ps (),\
+                (__mmask8)( __U));\
+})
+
+#define _mm256_getmant_ps( __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantps256_mask ((__v8sf)( __A),\
+                (__C << 2) |( __B),\
+                (__v8sf) _mm256_setzero_ps (),\
+                (__mmask8) -1);\
+})
+
+#define _mm256_mask_getmant_ps( __W, __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantps256_mask ((__v8sf)( __A),\
+                (__C << 2) |( __B),\
+                (__v8sf)( __W),\
+                (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_getmant_ps( __U, __A, __B, __C) __extension__ ({ \
+__builtin_ia32_getmantps256_mask ((__v8sf)( __A),\
+                (__C << 2) |( __B),\
+                (__v8sf) _mm256_setzero_ps (),\
+                (__mmask8)( __U));\
+})
+
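+/* Masked gathers: load each element from __addr plus the corresponding
+   index in __index scaled by __scale, for lanes whose bit is set in
+   __mask; other lanes keep their value from __v1_old. */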
+#define _mm_mmask_i64gather_pd( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3div2df ((__v2df) __v1_old, __addr, (__v2di) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm_mmask_i64gather_epi64( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3div2di ((__v2di) __v1_old, __addr, (__v2di) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm256_mmask_i64gather_pd( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3div4df ((__v4df) __v1_old, __addr, (__v4di) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm256_mmask_i64gather_epi64( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3div4di ((__v4di) __v1_old, __addr, (__v4di) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm_mmask_i64gather_ps( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3div4sf ((__v4sf) __v1_old, __addr, (__v2di) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm_mmask_i64gather_epi32( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3div4si ((__v4si) __v1_old, __addr, (__v2di) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm256_mmask_i64gather_ps( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3div8sf ((__v4sf) __v1_old, __addr, (__v4di) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm256_mmask_i64gather_epi32( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3div8si ((__v4si) __v1_old, __addr, (__v4di) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm_mmask_i32gather_pd( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3siv2df ((__v2df) __v1_old, __addr, (__v4si) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm_mmask_i32gather_epi64( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3siv2di ((__v2di) __v1_old, __addr, (__v4si) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm256_mmask_i32gather_pd( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3siv4df ((__v4df) __v1_old, __addr, (__v4si) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm256_mmask_i32gather_epi64( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3siv4di ((__v4di) __v1_old, __addr, (__v4si) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm_mmask_i32gather_ps( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3siv4sf ((__v4sf) __v1_old, __addr, (__v4si) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm_mmask_i32gather_epi32( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3siv4si ((__v4si) __v1_old, __addr, (__v4si) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm256_mmask_i32gather_ps( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3siv8sf ((__v8sf) __v1_old, __addr, (__v8si) __index,\
+                               __mask, __scale);\
+})
+
+#define _mm256_mmask_i32gather_epi32( __v1_old, __mask, __index, __addr, __scale) __extension__ ({\
+__builtin_ia32_gather3siv8si ((__v8si) __v1_old, __addr, (__v8si) __index,\
+                               __mask, __scale);\
+})
+
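+/* Full-width permutes: the permutex forms select elements with an
+   immediate, the permutexvar forms with a vector of indices in __X. */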
+#define _mm256_mask_permutex_pd( __W, __U, __X, __imm) __extension__ ({ \
+__builtin_ia32_permdf256_mask ((__v4df)( __X),( __imm),\
+                 (__v4df)( __W),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_permutex_pd( __U, __X, __imm) __extension__ ({ \
+__builtin_ia32_permdf256_mask ((__v4df)( __X),( __imm),\
+                 (__v4df) _mm256_setzero_pd (),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm256_permutex_pd( __X, __M) __extension__ ({ \
+__builtin_ia32_permdf256_mask ((__v4df)( __X),( __M),\
+                 (__v4df) _mm256_undefined_pd (),\
+                 (__mmask8) -1);\
+})
+
+#define _mm256_mask_permutex_epi64( __W, __M, __X, __I) __extension__ ({ \
+__builtin_ia32_permdi256_mask ((__v4di)( __X),\
+                 ( __I),\
+                 (__v4di)( __W),\
+                 (__mmask8)( __M));\
+})
+
+#define _mm256_maskz_permutex_epi64( __M, __X, __I) __extension__ ({ \
+__builtin_ia32_permdi256_mask ((__v4di)( __X),\
+                 ( __I),\
+                 (__v4di) _mm256_setzero_si256 (),\
+                 (__mmask8)( __M));\
+})
+
+#define _mm256_permutex_epi64( __X, __I) __extension__ ({ \
+__builtin_ia32_permdi256_mask ((__v4di)( __X),\
+                 ( __I),\
+                 (__v4di) _mm256_undefined_si256 (),\
+                 (__mmask8) -1);\
+})
+
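A sketch of the immediate-permute forms; each destination lane is selected by a 2-bit field of the immediate (reverse4 is an illustrative name):

#include <immintrin.h>

/* 0x1B = 0b00011011 selects source lanes 3,2,1,0: the vector is reversed. */
static __m256d reverse4(__m256d v)
{
  return _mm256_permutex_pd(v, 0x1B);
}
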
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                 (__v4di) __X,
+                 (__v4df) _mm256_undefined_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
+          __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                 (__v4di) __X,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                 (__v4di) __X,
+                 (__v4df) _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                 (__v4di) __X,
+                 (__v4di) _mm256_setzero_si256 (),
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                 (__v4di) __X,
+                 (__v4di) _mm256_undefined_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
+             __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                 (__v4di) __X,
+                 (__v4di) __W,
+                 __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
+          __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                (__v8si) __X,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                (__v8si) __X,
+                (__v8sf) _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutexvar_ps (__m256i __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                (__v8si) __X,
+                (__v8sf) _mm256_undefined_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                 (__v8si) __X,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+             __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                 (__v8si) __X,
+                 (__v8si) __W,
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                 (__v8si) __X,
+                 (__v8si) _mm256_undefined_si256(),
+                 (__mmask8) -1);
+}
+
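A sketch of the variable-permute forms, where the first argument supplies a per-lane source index (rotate1 is illustrative):

#include <immintrin.h>

/* dst[i] = v[idx[i]]: rotate the eight 32-bit elements down by one lane. */
static __m256i rotate1(__m256i v)
{
  const __m256i idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
  return _mm256_permutexvar_epi32(idx, v);
}
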
+#define _mm_alignr_epi32( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignd128_mask ((__v4si)( __A),\
+              (__v4si)( __B),( __imm),\
+              (__v4si) _mm_undefined_si128 (),\
+              (__mmask8) -1);\
+})
+
+#define _mm_mask_alignr_epi32( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignd128_mask ((__v4si)( __A),\
+              (__v4si)( __B),( __imm),\
+              (__v4si)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm_maskz_alignr_epi32( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignd128_mask ((__v4si)( __A),\
+              (__v4si)( __B),( __imm),\
+              (__v4si) _mm_setzero_si128 (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_alignr_epi32( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignd256_mask ((__v8si)( __A),\
+              (__v8si)( __B),( __imm),\
+              (__v8si) _mm256_undefined_si256 (),\
+              (__mmask8) -1);\
+})
+
+#define _mm256_mask_alignr_epi32( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignd256_mask ((__v8si)( __A),\
+              (__v8si)( __B),( __imm),\
+              (__v8si)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_alignr_epi32( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignd256_mask ((__v8si)( __A),\
+              (__v8si)( __B),( __imm),\
+              (__v8si) _mm256_setzero_si256 (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm_alignr_epi64( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignq128_mask ((__v2di)( __A),\
+              (__v2di)( __B),( __imm),\
+              (__v2di) _mm_undefined_si128 (),\
+              (__mmask8) -1);\
+})
+
+#define _mm_mask_alignr_epi64( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignq128_mask ((__v2di)( __A),\
+              (__v2di)( __B),( __imm),\
+              (__v2di)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm_maskz_alignr_epi64( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignq128_mask ((__v2di)( __A),\
+              (__v2di)( __B),( __imm),\
+              (__v2di) _mm_setzero_di (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_alignr_epi64( __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignq256_mask ((__v4di)( __A),\
+              (__v4di)( __B),( __imm),\
+              (__v4di) _mm256_undefined_si256 (),\
+              (__mmask8) -1);\
+})
+
+#define _mm256_mask_alignr_epi64( __W, __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignq256_mask ((__v4di)( __A),\
+              (__v4di)( __B),( __imm),\
+              (__v4di)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_alignr_epi64( __U, __A, __B, __imm) __extension__ ({ \
+__builtin_ia32_alignq256_mask ((__v4di)( __A),\
+              (__v4di)( __B),( __imm),\
+              (__v4di) _mm256_setzero_si256 (),\
+              (__mmask8)( __U));\
+})
+
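A sketch of the valignd usage pattern (shift_in_one is illustrative):

#include <immintrin.h>

/* Concatenates a (high) and b (low), shifts right by one 32-bit element,
 * and keeps the low 256 bits: dst = { b[1..7], a[0] }. */
static __m256i shift_in_one(__m256i a, __m256i b)
{
  return _mm256_alignr_epi32(a, b, 1);
}
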
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
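A sketch of the masked duplicate forms (dup_odd is illustrative):

#include <immintrin.h>

/* Duplicates the odd-index floats (1,1,3,3,...) into the lanes selected by
 * m; masked-off lanes are zeroed. */
static __m256 dup_odd(__mmask8 m, __m256 v)
{
  return _mm256_maskz_movehdup_ps(m, v);
}
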
+#define _mm256_mask_shuffle_epi32( __W, __U, __A, __I) __extension__({\
+__builtin_ia32_pshufd256_mask((__v8si) (__A), (__I),\
+              (__v8si) (__W), (__mmask8) __U);\
+})
+
+#define _mm256_maskz_shuffle_epi32( __U,  __A, __I) __extension__({\
+__builtin_ia32_pshufd256_mask((__v8si) (__A), (__I),\
+              (__v8si) _mm256_setzero_si256 (),\
+              (__mmask8) (__U));\
+})
+
+#define _mm_mask_shuffle_epi32( __W, __U, __A, __I) __extension__({\
+  __builtin_ia32_pshufd128_mask ((__v4si) (__A), (__I),\
+              (__v4si) (__W), (__mmask8) __U);\
+})
+
+#define _mm_maskz_shuffle_epi32( __U,  __A, __I) __extension__({\
+  __builtin_ia32_pshufd128_mask ((__v4si) (__A), (__I),\
+              (__v4si)\
+              _mm_setzero_si128 (),\
+              (__mmask8) (__U));\
+})
+
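A sketch of the masked pshufd forms (bcast0 is illustrative):

#include <immintrin.h>

/* Immediate 0x00 selects element 0 for every lane; lanes not selected by m
 * are zeroed. */
static __m128i bcast0(__mmask8 m, __m128i v)
{
  return _mm_maskz_shuffle_epi32(m, v, 0x00);
}
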
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A,
+              (__v2df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mov_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A,
+              (__v4df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A,
+              (__v4df)
+              _mm256_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A,
+             (__v4sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mov_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A,
+             (__v8sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A,
+             (__v8sf)
+             _mm256_setzero_ps (),
+             (__mmask8) __U);
+}
+
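The masked moves double as branch-free per-lane selects; a sketch (select_pd is illustrative):

#include <immintrin.h>

/* Lane-wise blend: result lane = (u bit set) ? a lane : w lane. */
static __m256d select_pd(__m256d w, __mmask8 u, __m256d a)
{
  return _mm256_mask_mov_pd(w, u, a);
}
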
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+             (__v4sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+#define _mm_mask_cvtps_ph( __W, __U, __A, __I) __extension__ ({ \
+__builtin_ia32_vcvtps2ph_mask ((__v4sf)( __A),( __I),\
+              (__v8hi)( __W),\
+              (__mmask8)( __U));\
+})
+
+#define _mm_maskz_cvtps_ph( __U, __A, __I) __extension__ ({ \
+__builtin_ia32_vcvtps2ph_mask ((__v4sf)( __A),( __I),\
+              (__v8hi)\
+              _mm_setzero_si128 (),\
+              (__mmask8)( __U));\
+})
+
+#define _mm256_mask_cvtps_ph( __W, __U, __A, __I) __extension__ ({ \
+__builtin_ia32_vcvtps2ph256_mask ((__v8sf)( __A),( __I),\
+                 (__v8hi)( __W),\
+                 (__mmask8)( __U));\
+})
+
+#define _mm256_maskz_cvtps_ph( __U, __A, __I) __extension__ ({ \
+__builtin_ia32_vcvtps2ph256_mask ((__v8sf)( __A),( __I),\
+                 (__v8hi)\
+                 _mm_setzero_si128 (),\
+                 (__mmask8)( __U));\
+})
+
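A sketch of a masked half-precision round trip, assuming the round-to-nearest immediate (roundtrip is illustrative):

#include <immintrin.h>

/* float -> fp16 -> float; only mask bits 0-3 are meaningful here. */
static __m128 roundtrip(__mmask8 m, __m128 v)
{
  __m128i h = _mm_maskz_cvtps_ph(m, v, _MM_FROUND_TO_NEAREST_INT);
  return _mm_maskz_cvtph_ps(m, h);
}
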
 #undef __DEFAULT_FN_ATTRS
 #undef __DEFAULT_FN_ATTRS_BOTH
 
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/emmintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/emmintrin.h
index cfc2c71..aba2438 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/emmintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/emmintrin.h
@@ -734,108 +734,348 @@
   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Multiplies the corresponding elements of two [8 x i16] vectors and
+///    returns a vector containing the low-order 16 bits of each 32-bit product
+///    in the corresponding element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a * (__v8hi)__b);
 }
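
The "low-order 16 bits" behavior is easy to observe; a sketch (mullo_demo is illustrative):

#include <emmintrin.h>

/* 300 * 300 = 90000; only the low 16 bits survive: 90000 mod 65536 = 24464. */
static __m128i mullo_demo(void)
{
  return _mm_mullo_epi16(_mm_set1_epi16(300), _mm_set1_epi16(300));
}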
 
+/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
+///    of the two 64-bit integer vectors and returns the 64-bit unsigned
+///    product.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMULUDQ instruction.
+///
+/// \param __a
+///    A 64-bit integer containing one of the source operands.
+/// \param __b
+///    A 64-bit integer containing one of the source operands.
+/// \returns A 64-bit integer vector containing the product of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mul_su32(__m64 __a, __m64 __b)
 {
   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
 }
 
+/// \brief Multiplies 32-bit unsigned integer values contained in the lower
+///    bits of the corresponding elements of two [2 x i64] vectors, and returns
+///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
+///
+/// \param __a
+///    A [2 x i64] vector containing one of the source operands.
+/// \param __b
+///    A [2 x i64] vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the product of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mul_epu32(__m128i __a, __m128i __b)
 {
   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Computes the absolute differences of corresponding 8-bit integer
+///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
+///    separately sums the second 8 absolute differences. Packs these two
+///    unsigned 16-bit integer sums into the upper and lower elements of a
+///    [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the sums of the sets of absolute
+///    differences between both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sad_epu8(__m128i __a, __m128i __b)
 {
   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
 }
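
A common use of the SAD operation, sketched below (byte_sums is illustrative):

#include <emmintrin.h>

/* With a zero second operand, this sums the 16 unsigned bytes of a: bits
 * [15:0] of each 64-bit half hold the sum of that half's eight bytes. */
static __m128i byte_sums(__m128i a)
{
  return _mm_sad_epu8(a, _mm_setzero_si128());
}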
 
+/// \brief Subtracts the corresponding 8-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qi)__a - (__v16qi)__b);
 }
 
+/// \brief Subtracts the corresponding 16-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a - (__v8hi)__b);
 }
 
+/// \brief Subtracts the corresponding 32-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a - (__v4si)__b);
 }
 
+/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
+///    difference to the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSUBQ instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the minuend.
+/// \param __b
+///    A 64-bit integer vector containing the subtrahend.
+/// \returns A 64-bit integer vector containing the difference of the values in
+///    the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_si64(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_psubq(__a, __b);
 }
 
+/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi64(__m128i __a, __m128i __b)
 {
   return __a - __b;
 }
 
+/// \brief Subtracts corresponding 8-bit signed integer values in the input and
+///    returns the differences in the corresponding bytes in the destination.
+///    Differences greater than 7Fh are saturated to 7Fh, and differences less
+///    than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Subtracts corresponding 16-bit signed integer values in the input and
+///    returns the differences in the corresponding words in the destination.
+///    Differences greater than 7FFFh are saturated to 7FFFh, and differences
+///    less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
+///    and returns the differences in the corresponding bytes in the
+///    destination. Differences less than 00h are saturated to 00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+///    differences of the values in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
+///    and returns the differences in the corresponding words in the
+///    destination. Differences less than 0000h are saturated to 0000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+///    differences of the values in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
 }
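
A sketch of the unsigned saturation described above (sat_demo is illustrative):

#include <emmintrin.h>

/* 10 - 20 saturates to 0 in every byte instead of wrapping to 246. */
static __m128i sat_demo(void)
{
  return _mm_subs_epu8(_mm_set1_epi8(10), _mm_set1_epi8(20));
}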
 
+/// \brief Performs a bitwise AND of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPAND / PAND instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise AND of the values
+///    in both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_and_si128(__m128i __a, __m128i __b)
 {
   return __a & __b;
 }
 
+/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
+///    one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
+///
+/// \param __a
+///    A 128-bit vector containing the left source operand. The one's complement
+///    of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector containing the right source operand.
+/// \returns A 128-bit integer vector containing the bitwise AND of the one's
+///    complement of the first operand and the values in the second operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_andnot_si128(__m128i __a, __m128i __b)
 {
   return ~__a & __b;
 }
-
+/// \brief Performs a bitwise OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPOR / POR instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise OR of the values
+///    in both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_or_si128(__m128i __a, __m128i __b)
 {
   return __a | __b;
 }
 
+/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
+///    values in both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_xor_si128(__m128i __a, __m128i __b)
 {
   return __a ^ __b;
 }
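
The AND-NOT form is most often used to clear selected bits; a sketch (clear_bits is illustrative):

#include <emmintrin.h>

/* Returns v & ~mask: clears in v every bit that is set in mask. */
static __m128i clear_bits(__m128i v, __m128i mask)
{
  return _mm_andnot_si128(mask, v);
}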
 
+/// \brief Left-shifts the 128-bit integer vector operand by the specified
+///    number of bytes. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_slli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to left-shift
+///    operand a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
 #define _mm_slli_si128(a, imm) __extension__ ({                         \
   (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(),        \
                                    (__v16qi)(__m128i)(a),               \
@@ -859,66 +1099,217 @@
 #define _mm_bslli_si128(a, imm) \
   _mm_slli_si128((a), (imm))
 
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
 }
 
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
 }
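
The immediate-count and vector-count shift forms are interchangeable when the count is a known constant; a sketch (both names illustrative):

#include <emmintrin.h>

/* Equivalent left shifts by 3: immediate count vs. count in bits [63:0]. */
static __m128i shl3_imm(__m128i a) { return _mm_slli_epi16(a, 3); }
static __m128i shl3_vec(__m128i a) { return _mm_sll_epi16(a, _mm_cvtsi32_si128(3)); }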
 
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
 }
 
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
 }
 
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi64(__m128i __a, int __count)
 {
   return __builtin_ia32_psllqi128(__a, __count);
 }
 
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi64(__m128i __a, __m128i __count)
 {
   return __builtin_ia32_psllq128(__a, __count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
 }
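
A sketch contrasting the arithmetic shift with the logical form documented further below (names illustrative):

#include <emmintrin.h>

/* Arithmetic shift keeps the sign: -16 >> 2 = -4 in every lane. */
static __m128i sra_demo(void) { return _mm_srai_epi32(_mm_set1_epi32(-16), 2); }
/* Logical shift fills with zeros: the same input yields 0x3FFFFFFC. */
static __m128i srl_demo(void) { return _mm_srli_epi32(_mm_set1_epi32(-16), 2); }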
 
+/// \brief Right-shifts the 128-bit integer vector operand by the specified
+///    number of bytes. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_srli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to right-shift operand
+///    a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 #define _mm_srli_si128(a, imm) __extension__ ({                          \
   (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a),                \
                                    (__v16qi)_mm_setzero_si128(),         \
@@ -942,60 +1333,191 @@
 #define _mm_bsrli_si128(a, imm) \
   _mm_srli_si128((a), (imm))
 
+/// \brief Right-shifts each of the 16-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
 }
 
+/// \brief Right-shifts each of the 16-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
 }
 
+/// \brief Right-shifts each of the 32-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
 }
 
+/// \brief Right-shifts each of the 32-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
 }
 
+/// \brief Right-shifts each of the 64-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi64(__m128i __a, int __count)
 {
   return __builtin_ia32_psrlqi128(__a, __count);
 }
 
+/// \brief Right-shifts each of the 64-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi64(__m128i __a, __m128i __count)
 {
   return __builtin_ia32_psrlq128(__a, __count);
 }
 
+/// \brief Compares each of the corresponding 8-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false, FFh
+///    for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qi)__a == (__v16qi)__b);
 }
 
+/// \brief Compares each of the corresponding 16-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
+///    for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a == (__v8hi)__b);
 }
 
+/// \brief Compares each of the corresponding 32-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false,
+///    FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a == (__v4si)__b);
 }
 
+/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are
+///    greater than those in the second operand. Each comparison yields 0h for
+///    false, FFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
 {
@@ -1004,30 +1526,100 @@
   return (__m128i)((__v16qs)__a > (__v16qs)__b);
 }
 
+/// \brief Compares each of the corresponding signed 16-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are greater than those in the second operand. Each comparison yields 0h
+///    for false, FFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a > (__v8hi)__b);
 }
 
+/// \brief Compares each of the corresponding signed 32-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are greater than those in the second operand. Each comparison yields 0h
+///    for false, FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a > (__v4si)__b);
 }
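
The all-ones/all-zeros results compose into branch-free selects; a pre-SSE4.1 signed max, sketched below (max32 is illustrative):

#include <emmintrin.h>

/* gt is FFFFFFFFh where a > b, 0h elsewhere; pick a or b accordingly. */
static __m128i max32(__m128i a, __m128i b)
{
  __m128i gt = _mm_cmpgt_epi32(a, b);
  return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
}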
 
+/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are less
+///    than those in the second operand. Each comparison yields 0h for false,
+///    FFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi8(__m128i __a, __m128i __b)
 {
   return _mm_cmpgt_epi8(__b, __a);
 }
 
+/// \brief Compares each of the corresponding signed 16-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are less than those in the second operand. Each comparison yields 0h for
+///    false, FFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi16(__m128i __a, __m128i __b)
 {
   return _mm_cmpgt_epi16(__b, __a);
 }
 
+/// \brief Compares each of the corresponding signed 32-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are less than those in the second operand. Each comparison yields 0h for
+///    false, FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi32(__m128i __a, __m128i __b)
 {
@@ -1035,6 +1627,23 @@
 }
 
 #ifdef __x86_64__
+/// \brief Converts a 64-bit signed integer value from the second operand into a
+///    double-precision value and returns it in the lower element of a [2 x
+///    double] vector; the upper element of the returned vector is copied from
+///    the upper element of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
+///    copied to the upper 64 bits of the destination.
+/// \param __b
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    converted value of the second operand. The upper 64 bits are copied from
+///    the upper 64 bits of the first operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtsi64_sd(__m128d __a, long long __b)
 {
@@ -1042,12 +1651,34 @@
   return __a;
 }
 
+/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+///    64-bit signed integer value, according to the current rounding mode.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 64-bit signed integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtsd_si64(__m128d __a)
 {
   return __builtin_ia32_cvtsd2si64(__a);
 }
 
+/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+///    64-bit signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 64-bit signed integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvttsd_si64(__m128d __a)
 {
@@ -1055,24 +1686,63 @@
 }
 #endif
 
+/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtepi32_ps(__m128i __a)
 {
   return __builtin_ia32_cvtdq2ps((__v4si)__a);
 }
 
+/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit integer vector of [4 x i32] containing the converted
+///    values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtps_epi32(__m128 __a)
 {
   return (__m128i)__builtin_ia32_cvtps2dq(__a);
 }
 
+/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
+///    truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x i32] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvttps_epi32(__m128 __a)
 {
   return (__m128i)__builtin_ia32_cvttps2dq(__a);
 }
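
A sketch of the difference between the rounding and truncating conversions, assuming the default round-to-nearest-even MXCSR mode (names illustrative):

#include <emmintrin.h>

/* 1.7f -> 2 with the rounding form, but 1.7f -> 1 with the truncating form. */
static __m128i rounded(void)   { return _mm_cvtps_epi32(_mm_set1_ps(1.7f)); }
static __m128i truncated(void) { return _mm_cvttps_epi32(_mm_set1_ps(1.7f)); }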
 
+/// \brief Returns a vector of [4 x i32] where the lowest element is the input
+///    operand and the remaining elements are zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+///
+/// \param __a
+///    A 32-bit signed integer operand.
+/// \returns A 128-bit vector of [4 x i32].
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi32_si128(int __a)
 {
@@ -1080,6 +1750,16 @@
 }
 
 #ifdef __x86_64__
+/// \brief Returns a vector of [2 x i64] where the lower element is the input
+///    operand and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+///
+/// \param __a
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x i64] containing the converted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi64_si128(long long __a)
 {
@@ -1087,6 +1767,17 @@
 }
 #endif
 
+/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
+///    32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+///
+/// \param __a
+///    A vector of [4 x i32]. The least significant 32 bits are moved to the
+///    destination.
+/// \returns A 32-bit signed integer containing the moved value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsi128_si32(__m128i __a)
 {
@@ -1095,6 +1786,17 @@
 }
 
 #ifdef __x86_64__
+/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
+///    64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+///
+/// \param __a
+///    A vector of [2 x i64]. The least significant 64 bits are moved to the
+///    destination.
+/// \returns A 64-bit signed integer containing the moved value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtsi128_si64(__m128i __a)
 {
@@ -1102,12 +1804,32 @@
 }
 #endif
 
+/// \brief Moves packed integer values from an aligned 128-bit memory location
+///    to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
+///
+/// \param __p
+///    An aligned pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_load_si128(__m128i const *__p)
 {
   return *__p;
 }
 
+/// \brief Moves packed integer values from an unaligned 128-bit memory location
+///    to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadu_si128(__m128i const *__p)
 {
@@ -1117,6 +1839,18 @@
   return ((struct __loadu_si128*)__p)->__v;
 }
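
A sketch of the unaligned form, which is safe for any pointer (load4 is illustrative):

#include <emmintrin.h>

/* No 16-byte alignment required, unlike _mm_load_si128. */
static __m128i load4(const int *p)
{
  return _mm_loadu_si128((const __m128i *)p);
}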
 
+/// \brief Returns a vector of [2 x i64] where the lower element is taken from
+///    the lower element of the operand, and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location. Bits [63:0] at that location are
+///    written to bits [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
+///    moved value. The higher order bits are cleared.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadl_epi64(__m128i const *__p)
 {
@@ -1126,66 +1860,270 @@
   return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
 }
 
+/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
+///    This could be used as an argument to another intrinsic function where the
+///    argument is required but the value is not actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit vector of [4 x i32] with unspecified content.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_undefined_si128()
 {
   return (__m128i)__builtin_ia32_undef128();
 }
 
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits of the
+///    destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits of the
+///    destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi64x(long long __q1, long long __q0)
 {
   return (__m128i){ __q0, __q1 };
 }
 
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits of the
+///    destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits of the
+///    destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi64(__m64 __q1, __m64 __q0)
 {
   return (__m128i){ (long long)__q0, (long long)__q1 };
 }
 
+/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
+///    the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i3
+///    A 32-bit integer value used to initialize bits [127:96] of the
+///    destination vector.
+/// \param __i2
+///    A 32-bit integer value used to initialize bits [95:64] of the destination
+///    vector.
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [63:32] of the destination
+///    vector.
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [31:0] of the destination
+///    vector.
+/// \returns An initialized 128-bit vector of [4 x i32] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
 {
   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
 }
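
Note the high-to-low argument order; a sketch (counting is illustrative):

#include <emmintrin.h>

/* Element 0 (bits [31:0]) comes from the last argument: memory order is
 * {0, 1, 2, 3}. */
static __m128i counting(void)
{
  return _mm_set_epi32(3, 2, 1, 0);
}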
 
+/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
+///    the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w7
+///    A 16-bit integer value used to initialize bits [127:112] of the
+///    destination vector.
+/// \param __w6
+///    A 16-bit integer value used to initialize bits [111:96] of the
+///    destination vector.
+/// \param __w5
+///    A 16-bit integer value used to initialize bits [95:80] of the destination
+///    vector.
+/// \param __w4
+///    A 16-bit integer value used to initialize bits [79:64] of the destination
+///    vector.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of the destination
+///    vector.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of the destination
+///    vector.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of the destination
+///    vector.
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the destination
+///    vector.
+/// \returns An initialized 128-bit vector of [8 x i16] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
 {
   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
 }
 
+/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
+///    the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b15
+///    Initializes bits [127:120] of the destination vector.
+/// \param __b14
+///    Initializes bits [119:112] of the destination vector.
+/// \param __b13
+///    Initializes bits [111:104] of the destination vector.
+/// \param __b12
+///    Initializes bits [103:96] of the destination vector.
+/// \param __b11
+///    Initializes bits [95:88] of the destination vector.
+/// \param __b10
+///    Initializes bits [87:80] of the destination vector.
+/// \param __b9
+///    Initializes bits [79:72] of the destination vector.
+/// \param __b8
+///    Initializes bits [71:64] of the destination vector.
+/// \param __b7
+///    Initializes bits [63:56] of the destination vector.
+/// \param __b6
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b5
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b4
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b3
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b2
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b1
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b0
+///    Initializes bits [7:0] of the destination vector.
+/// \returns An initialized 128-bit vector of [16 x i8] containing the values
+///    provided in the operands.
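+///
+/// A usage sketch (illustrative only; the variable name is hypothetical):
+/// \code
+///   // __b0, the last argument, lands in the lowest-addressed byte on a
+///   // little-endian target; __b15 occupies bits [127:120].
+///   __m128i v = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
+///                            7, 6, 5, 4, 3, 2, 1, 0);
+/// \endcode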
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
 {
   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
 }
 
+/// \brief Initializes both values in a 128-bit integer vector with the
+///    specified 64-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q
+///    A 64-bit integer value used to initialize the elements of the
+///    destination integer vector.
+/// \returns An initialized 128-bit integer vector of [2 x i64] with both
+///    elements containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64x(long long __q)
 {
   return (__m128i){ __q, __q };
 }
 
+/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
+///    specified 64-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q
+///    A 64-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [2 x i64] with both elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64(__m64 __q)
 {
   return (__m128i){ (long long)__q, (long long)__q };
 }
 
+/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
+///    specified 32-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i
+///    A 32-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [4 x i32] with all elements
+///    containing the value provided in the operand.
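+///
+/// A usage sketch (illustrative only; 'vals' stands for any __m128i value):
+/// \code
+///   __m128i ones = _mm_set1_epi32(1);         // { 1, 1, 1, 1 }
+///   __m128i next = _mm_add_epi32(vals, ones); // add 1 to each 32-bit lane
+/// \endcode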
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi32(int __i)
 {
   return (__m128i)(__v4si){ __i, __i, __i, __i };
 }
 
+/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
+///    specified 16-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w
+///    A 16-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [8 x i16] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi16(short __w)
 {
   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
+/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
+///    specified 8-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b
+///    An 8-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [16 x i8] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi8(char __b)
 {
   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
 }
 
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/immintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/immintrin.h
index ceaa103..a74dad8 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/immintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/immintrin.h
@@ -75,6 +75,8 @@
 
 #include <avx512vlbwintrin.h>
 
+#include <avx512vlcdintrin.h>
+
 #include <avx512vldqintrin.h>
 
 #include <avx512erintrin.h>
@@ -87,6 +89,8 @@
 
 #include <avx512vbmivlintrin.h>
 
+#include <avx512pfintrin.h>
+
 #include <pkuintrin.h>
 
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/module.modulemap b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/module.modulemap
index afb3865..4b2cb85 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/module.modulemap
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/module.modulemap
@@ -1,3 +1,26 @@
+/*===---- module.modulemap - intrinsics module map -------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
 module _Builtin_intrinsics [system] [extern_c] {
   explicit module altivec {
     requires altivec
@@ -24,7 +47,19 @@
     export *
 
     header "immintrin.h"
+    textual header "f16cintrin.h"
+    textual header "avxintrin.h"
+    textual header "avx2intrin.h"
+    textual header "avx512fintrin.h"
+    textual header "avx512erintrin.h"
+    textual header "fmaintrin.h"
+
     header "x86intrin.h"
+    textual header "bmiintrin.h"
+    textual header "bmi2intrin.h"
+    textual header "lzcntintrin.h"
+    textual header "xopintrin.h"
+    textual header "fma4intrin.h"
 
     explicit module mm_malloc {
       header "mm_malloc.h"
@@ -39,10 +74,6 @@
       header "mmintrin.h"
     }
 
-    explicit module f16c {
-      header "f16cintrin.h"
-    }
-
     explicit module sse {
       export mm_malloc
       export mmx
@@ -80,46 +111,6 @@
       header "ammintrin.h"
     }
 
-    explicit module avx {
-      export sse4_2
-      header "avxintrin.h"
-    }
-
-    explicit module avx2 {
-      export avx
-      header "avx2intrin.h"
-    }
-
-    explicit module avx512f {
-      export avx2
-      header "avx512fintrin.h"
-    }
-
-    explicit module avx512er {
-      header "avx512erintrin.h"
-    }
-
-    explicit module bmi {
-      header "bmiintrin.h"
-    }
-
-    explicit module bmi2 {
-      header "bmi2intrin.h"
-    }
-
-    explicit module fma {
-      header "fmaintrin.h"
-    }
-
-    explicit module fma4 {
-      export sse3
-      header "fma4intrin.h"
-    }
-
-    explicit module lzcnt {
-      header "lzcntintrin.h"
-    }
-
     explicit module popcnt {
       header "popcntintrin.h"
     }
@@ -128,11 +119,6 @@
       header "mm3dnow.h"
     }
 
-    explicit module xop {
-      export fma4
-      header "xopintrin.h"
-    }
-
     explicit module aes_pclmul {
       header "wmmintrin.h"
       export aes
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/xmmintrin.h b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/xmmintrin.h
index 2b34260..43f9422 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/xmmintrin.h
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/xmmintrin.h
@@ -1946,7 +1946,7 @@
 #undef __DEFAULT_FN_ATTRS
 
 /* Ugly hack for backwards-compatibility (compatible with gcc) */
-#if defined(__SSE2__) && !__has_feature(modules)
+#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
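+/* Skip the include only while the _Builtin_intrinsics module itself is being
+   built; now that these headers are textual, ordinary modules-enabled builds
+   still need it. */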
 #include <emmintrin.h>
 #endif
 
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/lib/linux/libclang_rt.asan-x86_64.a.syms b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/lib/linux/libclang_rt.asan-x86_64.a.syms
index e79245b..c89e34c 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/lib/linux/libclang_rt.asan-x86_64.a.syms
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/lib/linux/libclang_rt.asan-x86_64.a.syms
@@ -22,6 +22,7 @@
   __interceptor___isoc99_vsscanf;
   __interceptor___libc_memalign;
   __interceptor___overflow;
+  __interceptor___strdup;
   __interceptor___tls_get_addr;
   __interceptor___uflow;
   __interceptor___underflow;
@@ -29,6 +30,7 @@
   __interceptor___wuflow;
   __interceptor___wunderflow;
   __interceptor___xpg_strerror_r;
+  __interceptor___xstat;
   __interceptor__exit;
   __interceptor__longjmp;
   __interceptor__obstack_begin;
@@ -1073,6 +1075,7 @@
   __sanitizer_unaligned_store64;
   __sanitizer_update_counter_bitset_and_clear_counters;
   __sanitizer_verify_contiguous_container;
+  __strdup;
   __tls_get_addr;
   __ubsan_*;
   __uflow;
@@ -1081,6 +1084,7 @@
   __wuflow;
   __wunderflow;
   __xpg_strerror_r;
+  __xstat;
   _exit;
   _longjmp;
   _obstack_begin;
diff --git a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/lib/linux/libclang_rt.tsan-x86_64.a.syms b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/lib/linux/libclang_rt.tsan-x86_64.a.syms
index 33a1e7d..dca5734 100644
--- a/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/lib/linux/libclang_rt.tsan-x86_64.a.syms
+++ b/third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/lib/linux/libclang_rt.tsan-x86_64.a.syms
@@ -350,7 +350,6 @@
   __interceptor_socketpair;
   __interceptor_sprintf;
   __interceptor_sscanf;
-  __interceptor_stat;
   __interceptor_stat64;
   __interceptor_statfs;
   __interceptor_statfs64;
@@ -1461,7 +1460,6 @@
   socketpair;
   sprintf;
   sscanf;
-  stat;
   stat64;
   statfs;
   statfs64;