Update clang-tools to ab/5067893

This commit updates the header-checker binary blobs with the ones
built with clang-r339409b.

Bug: 111579848
Bug: 112949945
Test: development/vndk/tools/header-checker/utils/create_reference_dumps.py
Change-Id: Ib0dd0b6abe0b9c92dd27701dfe4ca3a95f597113
diff --git a/darwin-x86/clang-headers/cuda_builtin_vars.h b/darwin-x86/clang-headers/__clang_cuda_builtin_vars.h
similarity index 98%
rename from darwin-x86/clang-headers/cuda_builtin_vars.h
rename to darwin-x86/clang-headers/__clang_cuda_builtin_vars.h
index 6f5eb9c..290c4b2 100644
--- a/darwin-x86/clang-headers/cuda_builtin_vars.h
+++ b/darwin-x86/clang-headers/__clang_cuda_builtin_vars.h
@@ -54,7 +54,7 @@
 #define __DELETE
 #endif
 
-// Make sure nobody can create instances of the special varible types.  nvcc
+// Make sure nobody can create instances of the special variable types.  nvcc
 // also disallows taking address of special variables, so we disable address-of
 // operator as well.
 #define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName)                            \
diff --git a/darwin-x86/clang-headers/__clang_cuda_cmath.h b/darwin-x86/clang-headers/__clang_cuda_cmath.h
index ae7ff2f..5331ba4 100644
--- a/darwin-x86/clang-headers/__clang_cuda_cmath.h
+++ b/darwin-x86/clang-headers/__clang_cuda_cmath.h
@@ -26,13 +26,15 @@
 #error "This file is for CUDA compilation only."
 #endif
 
+#include <limits>
+
 // CUDA lets us use various std math functions on the device side.  This file
 // works in concert with __clang_cuda_math_forward_declares.h to make this work.
 //
 // Specifically, the forward-declares header declares __device__ overloads for
 // these functions in the global namespace, then pulls them into namespace std
 // with 'using' statements.  Then this file implements those functions, after
-// the implementations have been pulled in.
+// their implementations have been pulled in.
 //
 // It's important that we declare the functions in the global namespace and pull
 // them into namespace std with using statements, as opposed to simply declaring
@@ -70,10 +72,21 @@
 __DEVICE__ float frexp(float __arg, int *__exp) {
   return ::frexpf(__arg, __exp);
 }
+
+// For inscrutable reasons, the CUDA headers define these functions for us on
+// Windows.
+#ifndef _MSC_VER
 __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
 __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
 __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
-__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
+// For inscrutable reasons, __finite(), the double-precision version of
+// __finitef, does not exist when compiling for macOS.  __isfinited is available
+// everywhere and is just as good.
+__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
+__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+#endif
+
 __DEVICE__ bool isgreater(float __x, float __y) {
   return __builtin_isgreater(__x, __y);
 }
@@ -104,8 +117,6 @@
 __DEVICE__ bool islessgreater(double __x, double __y) {
   return __builtin_islessgreater(__x, __y);
 }
-__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
-__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
 __DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
 __DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
 __DEVICE__ bool isunordered(float __x, float __y) {
@@ -120,12 +131,6 @@
 __DEVICE__ float log(float __x) { return ::logf(__x); }
 __DEVICE__ float log10(float __x) { return ::log10f(__x); }
 __DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
-__DEVICE__ float nexttoward(float __from, float __to) {
-  return __builtin_nexttowardf(__from, __to);
-}
-__DEVICE__ double nexttoward(double __from, double __to) {
-  return __builtin_nexttoward(__from, __to);
-}
 __DEVICE__ float pow(float __base, float __exp) {
   return ::powf(__base, __exp);
 }
@@ -136,13 +141,332 @@
   return ::powi(__base, __iexp);
 }
 __DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
-__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); }
+__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
 __DEVICE__ float sin(float __x) { return ::sinf(__x); }
 __DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
 __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
 __DEVICE__ float tan(float __x) { return ::tanf(__x); }
 __DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
 
+// Notably missing above is nexttoward.  We omit it because
+// libdevice doesn't provide an implementation, and we don't want to be in the
+// business of implementing tricky libm functions in this header.
+
+// Now we've defined everything we promised we'd define in
+// __clang_cuda_math_forward_declares.h.  We need to do two additional things to
+// fix up our math functions.
+//
+// 1) Define __device__ overloads for e.g. sin(int).  The CUDA headers define
+//    only sin(float) and sin(double), which means that e.g. sin(0) is
+//    ambiguous.
+//
+// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
+//    std.  These are defined in the CUDA headers in the global namespace,
+//    independent of everything else we've done here.
+
+// We can't use std::enable_if, because we want to be pre-C++11 compatible.  But
+// we go ahead and unconditionally define functions that are only available when
+// compiling for C++11 to match the behavior of the CUDA headers.
+template<bool __B, class __T = void>
+struct __clang_cuda_enable_if {};
+
+template <class __T> struct __clang_cuda_enable_if<true, __T> {
+  typedef __T type;
+};
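+
+// For example, __clang_cuda_enable_if<true, double>::type is double, while
+// __clang_cuda_enable_if<false, double> has no ::type member, so an overload
+// whose return type names it drops out of overload resolution via SFINAE,
+// just as with std::enable_if.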
+
+// Defines an overload of __fn that accepts one integral argument, calls
+// __fn((double)x), and returns __retty.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn)                      \
+  template <typename __T>                                                      \
+  __DEVICE__                                                                   \
+      typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,    \
+                                      __retty>::type                           \
+      __fn(__T __x) {                                                          \
+    return ::__fn((double)__x);                                                \
+  }
+
+// Defines an overload of __fn that accepts two arithmetic arguments, calls
+// __fn((double)x, (double)y), and returns __retty.
+//
+// Note this is different from OVERLOAD_1, which generates an overload that
+// accepts only *integral* arguments.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn)                      \
+  template <typename __T1, typename __T2>                                      \
+  __DEVICE__ typename __clang_cuda_enable_if<                                  \
+      std::numeric_limits<__T1>::is_specialized &&                             \
+          std::numeric_limits<__T2>::is_specialized,                           \
+      __retty>::type                                                           \
+  __fn(__T1 __x, __T2 __y) {                                                   \
+    return __fn((double)__x, (double)__y);                                     \
+  }
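+
+// As an illustration, __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin) expands
+// to roughly:
+//
+//   template <typename __T>
+//   __DEVICE__ typename __clang_cuda_enable_if<
+//       std::numeric_limits<__T>::is_integer, double>::type
+//   sin(__T __x) { return ::sin((double)__x); }
+//
+// so a call like sin(0) unambiguously resolves to a double computation.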
+
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc)
+
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
+
+// Overloads for functions that don't match the patterns expected by
+// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
+template <typename __T1, typename __T2, typename __T3>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized &&
+        std::numeric_limits<__T3>::is_specialized,
+    double>::type
+fma(__T1 __x, __T2 __y, __T3 __z) {
+  return std::fma((double)__x, (double)__y, (double)__z);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+frexp(__T __x, int *__exp) {
+  return std::frexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+ldexp(__T __x, int __exp) {
+  return std::ldexp((double)__x, __exp);
+}
+
+template <typename __T1, typename __T2>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized,
+    double>::type
+remquo(__T1 __x, __T2 __y, int *__quo) {
+  return std::remquo((double)__x, (double)__y, __quo);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbln(__T __x, long __exp) {
+  return std::scalbln((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbn(__T __x, int __exp) {
+  return std::scalbn((double)__x, __exp);
+}
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+// Pull the new overloads we defined above into namespace std.
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::llround;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::nearbyint;
+using ::nextafter;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+// Well this is fun: We need to pull these symbols in for libc++, but we can't
+// pull them in with libstdc++, because its ::isinf and ::isnan are different
+// than its std::isinf and std::isnan.
+#ifndef __GLIBCXX__
+using ::isinf;
+using ::isnan;
+#endif
+
+// Finally, pull the "foobarf" functions that CUDA defines in its headers into
+// namespace std.
+using ::acosf;
+using ::acoshf;
+using ::asinf;
+using ::asinhf;
+using ::atan2f;
+using ::atanf;
+using ::atanhf;
+using ::cbrtf;
+using ::ceilf;
+using ::copysignf;
+using ::cosf;
+using ::coshf;
+using ::erfcf;
+using ::erff;
+using ::exp2f;
+using ::expf;
+using ::expm1f;
+using ::fabsf;
+using ::fdimf;
+using ::floorf;
+using ::fmaf;
+using ::fmaxf;
+using ::fminf;
+using ::fmodf;
+using ::frexpf;
+using ::hypotf;
+using ::ilogbf;
+using ::ldexpf;
+using ::lgammaf;
+using ::llrintf;
+using ::llroundf;
+using ::log10f;
+using ::log1pf;
+using ::log2f;
+using ::logbf;
+using ::logf;
+using ::lrintf;
+using ::lroundf;
+using ::modff;
+using ::nearbyintf;
+using ::nextafterf;
+using ::powf;
+using ::remainderf;
+using ::remquof;
+using ::rintf;
+using ::roundf;
+using ::scalblnf;
+using ::scalbnf;
+using ::sinf;
+using ::sinhf;
+using ::sqrtf;
+using ::tanf;
+using ::tanhf;
+using ::tgammaf;
+using ::truncf;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
 #undef __DEVICE__
 
 #endif
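
To see what the new cmath overloads buy, here is a minimal sketch of device
code that now compiles unambiguously (the kernel name "demo" is illustrative
and not part of this change):

    #include <cmath>

    __global__ void demo(double *__out) {
      // Previously ambiguous between sin(float) and sin(double); the integral
      // overload added above makes this resolve to a double computation.
      __out[0] = std::sin(0);
      // pow(int, int) likewise resolves via the two-argument arithmetic
      // overload.
      __out[1] = std::pow(2, 10);
    }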
diff --git a/darwin-x86/clang-headers/__clang_cuda_complex_builtins.h b/darwin-x86/clang-headers/__clang_cuda_complex_builtins.h
new file mode 100644
index 0000000..beef7de
--- /dev/null
+++ b/darwin-x86/clang-headers/__clang_cuda_complex_builtins.h
@@ -0,0 +1,203 @@
+/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
+#define __CLANG_CUDA_COMPLEX_BUILTINS
+
+// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3.  These are
+// libgcc functions that clang assumes are available when compiling C99 complex
+// operations.  (These implementations come from libc++, and have been modified
+// to work with CUDA.)
+
+extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
+                                                      double __c, double __d) {
+  double __ac = __a * __c;
+  double __bd = __b * __d;
+  double __ad = __a * __d;
+  double __bc = __b * __c;
+  double _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
+      // a device overload (and isn't constexpr before C++11, naturally).
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  float __ac = __a * __c;
+  float __bd = __b * __d;
+  float __ad = __a * __d;
+  float __bc = __b * __c;
+  float _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
+                                                      double __c, double __d) {
+  int __ilogbw = 0;
+  // Can't use std::max, because that's defined in <algorithm>, and we don't
+  // want to pull that in for every compile.  The CUDA headers define
+  // ::max(float, float) and ::max(double, double), which is sufficient for us.
+  double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  double __denom = __c * __c + __d * __d;
+  double _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
+      __real__(z) = 0.0 * (__a * __c + __b * __d);
+      __imag__(z) = 0.0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  int __ilogbw = 0;
+  float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  float __denom = __c * __c + __d * __d;
+  float _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      __real__(z) = 0 * (__a * __c + __b * __d);
+      __imag__(z) = 0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+#endif // __CLANG_CUDA_COMPLEX_BUILTINS
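
Clang emits calls to these four helpers when it lowers C99 _Complex
multiplication and division.  A small sketch of device code that ends up
exercising them (the kernel name "cdemo" is illustrative):

    __global__ void cdemo(double _Complex *__z,
                          double _Complex __a, double _Complex __b) {
      __z[0] = __a * __b; // lowered to a __muldc3 call for full C99 semantics
      __z[1] = __a / __b; // lowered to a __divdc3 call
    }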
diff --git a/darwin-x86/clang-headers/__clang_cuda_device_functions.h b/darwin-x86/clang-headers/__clang_cuda_device_functions.h
new file mode 100644
index 0000000..67bbc68
--- /dev/null
+++ b/darwin-x86/clang-headers/__clang_cuda_device_functions.h
@@ -0,0 +1,1768 @@
+/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__
+#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__
+
+#if CUDA_VERSION < 9000
+#error This file is intended to be used with CUDA-9+ only.
+#endif
+
+// __DEVICE__ is a helper macro with a common set of attributes for the
+// wrappers we implement in this file. We need static to avoid emitting unused
+// functions, and __forceinline__ helps inline these wrappers at -O1.
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__ static __device__ __forceinline__
+
+// libdevice provides fast low-precision and slow full-precision
+// implementations for some functions. Which one gets selected depends on
+// __CLANG_CUDA_APPROX_TRANSCENDENTALS__, which gets defined by clang if
+// -ffast-math or -fcuda-approx-transcendentals is in effect.
+#pragma push_macro("__FAST_OR_SLOW")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __FAST_OR_SLOW(fast, slow) fast
+#else
+#define __FAST_OR_SLOW(fast, slow) slow
+#endif
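+
+// For example, a wrapper written as
+//   __DEVICE__ float sinf(float __a) {
+//     return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
+//   }
+// calls the fast, approximate libdevice routine only when
+// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ is defined.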
+
+__DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); }
+__DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); }
+__DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); }
+__DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); }
+__DEVICE__ unsigned long long __brevll(unsigned long long __a) {
+  return __nv_brevll(__a);
+}
+__DEVICE__ void __brkpt() { asm volatile("brkpt;"); }
+__DEVICE__ void __brkpt(int __a) { __brkpt(); }
+__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b,
+                                    unsigned int __c) {
+  return __nv_byte_perm(__a, __b, __c);
+}
+__DEVICE__ int __clz(int __a) { return __nv_clz(__a); }
+__DEVICE__ int __clzll(long long __a) { return __nv_clzll(__a); }
+__DEVICE__ float __cosf(float __a) { return __nv_fast_cosf(__a); }
+__DEVICE__ double __dAtomicAdd(double *__p, double __v) {
+  return __nvvm_atom_add_gen_d(__p, __v);
+}
+__DEVICE__ double __dAtomicAdd_block(double *__p, double __v) {
+  return __nvvm_atom_cta_add_gen_d(__p, __v);
+}
+__DEVICE__ double __dAtomicAdd_system(double *__p, double __v) {
+  return __nvvm_atom_sys_add_gen_d(__p, __v);
+}
+__DEVICE__ double __dadd_rd(double __a, double __b) {
+  return __nv_dadd_rd(__a, __b);
+}
+__DEVICE__ double __dadd_rn(double __a, double __b) {
+  return __nv_dadd_rn(__a, __b);
+}
+__DEVICE__ double __dadd_ru(double __a, double __b) {
+  return __nv_dadd_ru(__a, __b);
+}
+__DEVICE__ double __dadd_rz(double __a, double __b) {
+  return __nv_dadd_rz(__a, __b);
+}
+__DEVICE__ double __ddiv_rd(double __a, double __b) {
+  return __nv_ddiv_rd(__a, __b);
+}
+__DEVICE__ double __ddiv_rn(double __a, double __b) {
+  return __nv_ddiv_rn(__a, __b);
+}
+__DEVICE__ double __ddiv_ru(double __a, double __b) {
+  return __nv_ddiv_ru(__a, __b);
+}
+__DEVICE__ double __ddiv_rz(double __a, double __b) {
+  return __nv_ddiv_rz(__a, __b);
+}
+__DEVICE__ double __dmul_rd(double __a, double __b) {
+  return __nv_dmul_rd(__a, __b);
+}
+__DEVICE__ double __dmul_rn(double __a, double __b) {
+  return __nv_dmul_rn(__a, __b);
+}
+__DEVICE__ double __dmul_ru(double __a, double __b) {
+  return __nv_dmul_ru(__a, __b);
+}
+__DEVICE__ double __dmul_rz(double __a, double __b) {
+  return __nv_dmul_rz(__a, __b);
+}
+__DEVICE__ float __double2float_rd(double __a) {
+  return __nv_double2float_rd(__a);
+}
+__DEVICE__ float __double2float_rn(double __a) {
+  return __nv_double2float_rn(__a);
+}
+__DEVICE__ float __double2float_ru(double __a) {
+  return __nv_double2float_ru(__a);
+}
+__DEVICE__ float __double2float_rz(double __a) {
+  return __nv_double2float_rz(__a);
+}
+__DEVICE__ int __double2hiint(double __a) { return __nv_double2hiint(__a); }
+__DEVICE__ int __double2int_rd(double __a) { return __nv_double2int_rd(__a); }
+__DEVICE__ int __double2int_rn(double __a) { return __nv_double2int_rn(__a); }
+__DEVICE__ int __double2int_ru(double __a) { return __nv_double2int_ru(__a); }
+__DEVICE__ int __double2int_rz(double __a) { return __nv_double2int_rz(__a); }
+__DEVICE__ long long __double2ll_rd(double __a) {
+  return __nv_double2ll_rd(__a);
+}
+__DEVICE__ long long __double2ll_rn(double __a) {
+  return __nv_double2ll_rn(__a);
+}
+__DEVICE__ long long __double2ll_ru(double __a) {
+  return __nv_double2ll_ru(__a);
+}
+__DEVICE__ long long __double2ll_rz(double __a) {
+  return __nv_double2ll_rz(__a);
+}
+__DEVICE__ int __double2loint(double __a) { return __nv_double2loint(__a); }
+__DEVICE__ unsigned int __double2uint_rd(double __a) {
+  return __nv_double2uint_rd(__a);
+}
+__DEVICE__ unsigned int __double2uint_rn(double __a) {
+  return __nv_double2uint_rn(__a);
+}
+__DEVICE__ unsigned int __double2uint_ru(double __a) {
+  return __nv_double2uint_ru(__a);
+}
+__DEVICE__ unsigned int __double2uint_rz(double __a) {
+  return __nv_double2uint_rz(__a);
+}
+__DEVICE__ unsigned long long __double2ull_rd(double __a) {
+  return __nv_double2ull_rd(__a);
+}
+__DEVICE__ unsigned long long __double2ull_rn(double __a) {
+  return __nv_double2ull_rn(__a);
+}
+__DEVICE__ unsigned long long __double2ull_ru(double __a) {
+  return __nv_double2ull_ru(__a);
+}
+__DEVICE__ unsigned long long __double2ull_rz(double __a) {
+  return __nv_double2ull_rz(__a);
+}
+__DEVICE__ long long __double_as_longlong(double __a) {
+  return __nv_double_as_longlong(__a);
+}
+__DEVICE__ double __drcp_rd(double __a) { return __nv_drcp_rd(__a); }
+__DEVICE__ double __drcp_rn(double __a) { return __nv_drcp_rn(__a); }
+__DEVICE__ double __drcp_ru(double __a) { return __nv_drcp_ru(__a); }
+__DEVICE__ double __drcp_rz(double __a) { return __nv_drcp_rz(__a); }
+__DEVICE__ double __dsqrt_rd(double __a) { return __nv_dsqrt_rd(__a); }
+__DEVICE__ double __dsqrt_rn(double __a) { return __nv_dsqrt_rn(__a); }
+__DEVICE__ double __dsqrt_ru(double __a) { return __nv_dsqrt_ru(__a); }
+__DEVICE__ double __dsqrt_rz(double __a) { return __nv_dsqrt_rz(__a); }
+__DEVICE__ double __dsub_rd(double __a, double __b) {
+  return __nv_dsub_rd(__a, __b);
+}
+__DEVICE__ double __dsub_rn(double __a, double __b) {
+  return __nv_dsub_rn(__a, __b);
+}
+__DEVICE__ double __dsub_ru(double __a, double __b) {
+  return __nv_dsub_ru(__a, __b);
+}
+__DEVICE__ double __dsub_rz(double __a, double __b) {
+  return __nv_dsub_rz(__a, __b);
+}
+__DEVICE__ float __exp10f(float __a) { return __nv_fast_exp10f(__a); }
+__DEVICE__ float __expf(float __a) { return __nv_fast_expf(__a); }
+__DEVICE__ float __fAtomicAdd(float *__p, float __v) {
+  return __nvvm_atom_add_gen_f(__p, __v);
+}
+__DEVICE__ float __fAtomicAdd_block(float *__p, float __v) {
+  return __nvvm_atom_cta_add_gen_f(__p, __v);
+}
+__DEVICE__ float __fAtomicAdd_system(float *__p, float __v) {
+  return __nvvm_atom_sys_add_gen_f(__p, __v);
+}
+__DEVICE__ float __fAtomicExch(float *__p, float __v) {
+  return __nv_int_as_float(
+      __nvvm_atom_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
+}
+__DEVICE__ float __fAtomicExch_block(float *__p, float __v) {
+  return __nv_int_as_float(
+      __nvvm_atom_cta_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
+}
+__DEVICE__ float __fAtomicExch_system(float *__p, float __v) {
+  return __nv_int_as_float(
+      __nvvm_atom_sys_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
+}
+__DEVICE__ float __fadd_rd(float __a, float __b) {
+  return __nv_fadd_rd(__a, __b);
+}
+__DEVICE__ float __fadd_rn(float __a, float __b) {
+  return __nv_fadd_rn(__a, __b);
+}
+__DEVICE__ float __fadd_ru(float __a, float __b) {
+  return __nv_fadd_ru(__a, __b);
+}
+__DEVICE__ float __fadd_rz(float __a, float __b) {
+  return __nv_fadd_rz(__a, __b);
+}
+__DEVICE__ float __fdiv_rd(float __a, float __b) {
+  return __nv_fdiv_rd(__a, __b);
+}
+__DEVICE__ float __fdiv_rn(float __a, float __b) {
+  return __nv_fdiv_rn(__a, __b);
+}
+__DEVICE__ float __fdiv_ru(float __a, float __b) {
+  return __nv_fdiv_ru(__a, __b);
+}
+__DEVICE__ float __fdiv_rz(float __a, float __b) {
+  return __nv_fdiv_rz(__a, __b);
+}
+__DEVICE__ float __fdividef(float __a, float __b) {
+  return __nv_fast_fdividef(__a, __b);
+}
+__DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); }
+__DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); }
+__DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); }
+__DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); }
+__DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); }
+__DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); }
+__DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); }
+__DEVICE__ int __float2int_rz(float __a) { return __nv_float2int_rz(__a); }
+__DEVICE__ long long __float2ll_rd(float __a) { return __nv_float2ll_rd(__a); }
+__DEVICE__ long long __float2ll_rn(float __a) { return __nv_float2ll_rn(__a); }
+__DEVICE__ long long __float2ll_ru(float __a) { return __nv_float2ll_ru(__a); }
+__DEVICE__ long long __float2ll_rz(float __a) { return __nv_float2ll_rz(__a); }
+__DEVICE__ unsigned int __float2uint_rd(float __a) {
+  return __nv_float2uint_rd(__a);
+}
+__DEVICE__ unsigned int __float2uint_rn(float __a) {
+  return __nv_float2uint_rn(__a);
+}
+__DEVICE__ unsigned int __float2uint_ru(float __a) {
+  return __nv_float2uint_ru(__a);
+}
+__DEVICE__ unsigned int __float2uint_rz(float __a) {
+  return __nv_float2uint_rz(__a);
+}
+__DEVICE__ unsigned long long __float2ull_rd(float __a) {
+  return __nv_float2ull_rd(__a);
+}
+__DEVICE__ unsigned long long __float2ull_rn(float __a) {
+  return __nv_float2ull_rn(__a);
+}
+__DEVICE__ unsigned long long __float2ull_ru(float __a) {
+  return __nv_float2ull_ru(__a);
+}
+__DEVICE__ unsigned long long __float2ull_rz(float __a) {
+  return __nv_float2ull_rz(__a);
+}
+__DEVICE__ int __float_as_int(float __a) { return __nv_float_as_int(__a); }
+__DEVICE__ unsigned int __float_as_uint(float __a) {
+  return __nv_float_as_uint(__a);
+}
+__DEVICE__ double __fma_rd(double __a, double __b, double __c) {
+  return __nv_fma_rd(__a, __b, __c);
+}
+__DEVICE__ double __fma_rn(double __a, double __b, double __c) {
+  return __nv_fma_rn(__a, __b, __c);
+}
+__DEVICE__ double __fma_ru(double __a, double __b, double __c) {
+  return __nv_fma_ru(__a, __b, __c);
+}
+__DEVICE__ double __fma_rz(double __a, double __b, double __c) {
+  return __nv_fma_rz(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) {
+  return __nv_fmaf_ieee_rd(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) {
+  return __nv_fmaf_ieee_rn(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) {
+  return __nv_fmaf_ieee_ru(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) {
+  return __nv_fmaf_ieee_rz(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_rd(float __a, float __b, float __c) {
+  return __nv_fmaf_rd(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_rn(float __a, float __b, float __c) {
+  return __nv_fmaf_rn(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ru(float __a, float __b, float __c) {
+  return __nv_fmaf_ru(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_rz(float __a, float __b, float __c) {
+  return __nv_fmaf_rz(__a, __b, __c);
+}
+__DEVICE__ float __fmul_rd(float __a, float __b) {
+  return __nv_fmul_rd(__a, __b);
+}
+__DEVICE__ float __fmul_rn(float __a, float __b) {
+  return __nv_fmul_rn(__a, __b);
+}
+__DEVICE__ float __fmul_ru(float __a, float __b) {
+  return __nv_fmul_ru(__a, __b);
+}
+__DEVICE__ float __fmul_rz(float __a, float __b) {
+  return __nv_fmul_rz(__a, __b);
+}
+__DEVICE__ float __frcp_rd(float __a) { return __nv_frcp_rd(__a); }
+__DEVICE__ float __frcp_rn(float __a) { return __nv_frcp_rn(__a); }
+__DEVICE__ float __frcp_ru(float __a) { return __nv_frcp_ru(__a); }
+__DEVICE__ float __frcp_rz(float __a) { return __nv_frcp_rz(__a); }
+__DEVICE__ float __frsqrt_rn(float __a) { return __nv_frsqrt_rn(__a); }
+__DEVICE__ float __fsqrt_rd(float __a) { return __nv_fsqrt_rd(__a); }
+__DEVICE__ float __fsqrt_rn(float __a) { return __nv_fsqrt_rn(__a); }
+__DEVICE__ float __fsqrt_ru(float __a) { return __nv_fsqrt_ru(__a); }
+__DEVICE__ float __fsqrt_rz(float __a) { return __nv_fsqrt_rz(__a); }
+__DEVICE__ float __fsub_rd(float __a, float __b) {
+  return __nv_fsub_rd(__a, __b);
+}
+__DEVICE__ float __fsub_rn(float __a, float __b) {
+  return __nv_fsub_rn(__a, __b);
+}
+__DEVICE__ float __fsub_ru(float __a, float __b) {
+  return __nv_fsub_ru(__a, __b);
+}
+__DEVICE__ float __fsub_rz(float __a, float __b) {
+  return __nv_fsub_rz(__a, __b);
+}
+__DEVICE__ int __hadd(int __a, int __b) { return __nv_hadd(__a, __b); }
+__DEVICE__ double __hiloint2double(int __a, int __b) {
+  return __nv_hiloint2double(__a, __b);
+}
+__DEVICE__ int __iAtomicAdd(int *__p, int __v) {
+  return __nvvm_atom_add_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAdd_block(int *__p, int __v) {
+  return __nvvm_atom_cta_add_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAdd_system(int *__p, int __v) {
+  return __nvvm_atom_sys_add_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAnd(int *__p, int __v) {
+  return __nvvm_atom_and_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAnd_block(int *__p, int __v) {
+  return __nvvm_atom_cta_and_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAnd_system(int *__p, int __v) {
+  return __nvvm_atom_sys_and_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicCAS(int *__p, int __cmp, int __v) {
+  return __nvvm_atom_cas_gen_i(__p, __cmp, __v);
+}
+__DEVICE__ int __iAtomicCAS_block(int *__p, int __cmp, int __v) {
+  return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v);
+}
+__DEVICE__ int __iAtomicCAS_system(int *__p, int __cmp, int __v) {
+  return __nvvm_atom_sys_cas_gen_i(__p, __cmp, __v);
+}
+__DEVICE__ int __iAtomicExch(int *__p, int __v) {
+  return __nvvm_atom_xchg_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicExch_block(int *__p, int __v) {
+  return __nvvm_atom_cta_xchg_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicExch_system(int *__p, int __v) {
+  return __nvvm_atom_sys_xchg_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMax(int *__p, int __v) {
+  return __nvvm_atom_max_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMax_block(int *__p, int __v) {
+  return __nvvm_atom_cta_max_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMax_system(int *__p, int __v) {
+  return __nvvm_atom_sys_max_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMin(int *__p, int __v) {
+  return __nvvm_atom_min_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMin_block(int *__p, int __v) {
+  return __nvvm_atom_cta_min_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMin_system(int *__p, int __v) {
+  return __nvvm_atom_sys_min_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicOr(int *__p, int __v) {
+  return __nvvm_atom_or_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicOr_block(int *__p, int __v) {
+  return __nvvm_atom_cta_or_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicOr_system(int *__p, int __v) {
+  return __nvvm_atom_sys_or_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicXor(int *__p, int __v) {
+  return __nvvm_atom_xor_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicXor_block(int *__p, int __v) {
+  return __nvvm_atom_cta_xor_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicXor_system(int *__p, int __v) {
+  return __nvvm_atom_sys_xor_gen_i(__p, __v);
+}
+__DEVICE__ long long __illAtomicMax(long long *__p, long long __v) {
+  return __nvvm_atom_max_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMax_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_max_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMax_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_max_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMin(long long *__p, long long __v) {
+  return __nvvm_atom_min_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMin_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_min_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMin_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_min_gen_ll(__p, __v);
+}
+__DEVICE__ double __int2double_rn(int __a) { return __nv_int2double_rn(__a); }
+__DEVICE__ float __int2float_rd(int __a) { return __nv_int2float_rd(__a); }
+__DEVICE__ float __int2float_rn(int __a) { return __nv_int2float_rn(__a); }
+__DEVICE__ float __int2float_ru(int __a) { return __nv_int2float_ru(__a); }
+__DEVICE__ float __int2float_rz(int __a) { return __nv_int2float_rz(__a); }
+__DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); }
+__DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); }
+__DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); }
+__DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); }
+__DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); }
+__DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); }
+__DEVICE__ double __ll2double_rd(long long __a) {
+  return __nv_ll2double_rd(__a);
+}
+__DEVICE__ double __ll2double_rn(long long __a) {
+  return __nv_ll2double_rn(__a);
+}
+__DEVICE__ double __ll2double_ru(long long __a) {
+  return __nv_ll2double_ru(__a);
+}
+__DEVICE__ double __ll2double_rz(long long __a) {
+  return __nv_ll2double_rz(__a);
+}
+__DEVICE__ float __ll2float_rd(long long __a) { return __nv_ll2float_rd(__a); }
+__DEVICE__ float __ll2float_rn(long long __a) { return __nv_ll2float_rn(__a); }
+__DEVICE__ float __ll2float_ru(long long __a) { return __nv_ll2float_ru(__a); }
+__DEVICE__ float __ll2float_rz(long long __a) { return __nv_ll2float_rz(__a); }
+__DEVICE__ long long __llAtomicAnd(long long *__p, long long __v) {
+  return __nvvm_atom_and_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicAnd_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_and_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicAnd_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_and_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicOr(long long *__p, long long __v) {
+  return __nvvm_atom_or_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicOr_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_or_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicOr_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_or_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicXor(long long *__p, long long __v) {
+  return __nvvm_atom_xor_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicXor_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_xor_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicXor_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_xor_gen_ll(__p, __v);
+}
+__DEVICE__ float __log10f(float __a) { return __nv_fast_log10f(__a); }
+__DEVICE__ float __log2f(float __a) { return __nv_fast_log2f(__a); }
+__DEVICE__ float __logf(float __a) { return __nv_fast_logf(__a); }
+__DEVICE__ double __longlong_as_double(long long __a) {
+  return __nv_longlong_as_double(__a);
+}
+__DEVICE__ int __mul24(int __a, int __b) { return __nv_mul24(__a, __b); }
+__DEVICE__ long long __mul64hi(long long __a, long long __b) {
+  return __nv_mul64hi(__a, __b);
+}
+__DEVICE__ int __mulhi(int __a, int __b) { return __nv_mulhi(__a, __b); }
+__DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); }
+__DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); }
+__DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); }
+__DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); }
+__DEVICE__ int __popc(int __a) { return __nv_popc(__a); }
+__DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); }
+__DEVICE__ float __powf(float __a, float __b) {
+  return __nv_fast_powf(__a, __b);
+}
+
+// Parameter must have a known integer value.
+#define __prof_trigger(__a) asm __volatile__("pmevent \t%0;" ::"i"(__a))
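+// Usage sketch: __prof_trigger(0); emits "pmevent 0", triggering
+// performance-monitor event 0.  The "i" constraint is why the argument must
+// be a compile-time integer constant.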
+__DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); }
+__DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
+  return __nv_sad(__a, __b, __c);
+}
+__DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); }
+__DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); }
+__DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); }
+__DEVICE__ void __sincosf(float __a, float *__sptr, float *__cptr) {
+  return __nv_fast_sincosf(__a, __sptr, __cptr);
+}
+__DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); }
+__DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); }
+__DEVICE__ int __syncthreads_count(int __a) { return __nvvm_bar0_popc(__a); }
+__DEVICE__ int __syncthreads_or(int __a) { return __nvvm_bar0_or(__a); }
+__DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
+__DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
+__DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }
+__DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }
+__DEVICE__ void __trap(void) { asm volatile("trap;"); }
+__DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_add_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAdd_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_add_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAdd_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_add_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAnd(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_and_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAnd_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_and_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAnd_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_and_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicCAS(unsigned int *__p, unsigned int __cmp,
+                                     unsigned int __v) {
+  return __nvvm_atom_cas_gen_i((int *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned int
+__uAtomicCAS_block(unsigned int *__p, unsigned int __cmp, unsigned int __v) {
+  return __nvvm_atom_cta_cas_gen_i((int *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned int
+__uAtomicCAS_system(unsigned int *__p, unsigned int __cmp, unsigned int __v) {
+  return __nvvm_atom_sys_cas_gen_i((int *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned int __uAtomicDec(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_dec_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicDec_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_dec_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicDec_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_dec_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicExch(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_xchg_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicExch_block(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_cta_xchg_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicExch_system(unsigned int *__p,
+                                             unsigned int __v) {
+  return __nvvm_atom_sys_xchg_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicInc(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_inc_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicInc_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_inc_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicInc_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_inc_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMax(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_max_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMax_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_max_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMax_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_max_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMin(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_min_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMin_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_min_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMin_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_min_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicOr(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_or_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicOr_block(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_cta_or_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicOr_system(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_sys_or_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicXor(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_xor_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicXor_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_xor_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicXor_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_xor_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uhadd(unsigned int __a, unsigned int __b) {
+  return __nv_uhadd(__a, __b);
+}
+__DEVICE__ double __uint2double_rn(unsigned int __a) {
+  return __nv_uint2double_rn(__a);
+}
+__DEVICE__ float __uint2float_rd(unsigned int __a) {
+  return __nv_uint2float_rd(__a);
+}
+__DEVICE__ float __uint2float_rn(unsigned int __a) {
+  return __nv_uint2float_rn(__a);
+}
+__DEVICE__ float __uint2float_ru(unsigned int __a) {
+  return __nv_uint2float_ru(__a);
+}
+__DEVICE__ float __uint2float_rz(unsigned int __a) {
+  return __nv_uint2float_rz(__a);
+}
+__DEVICE__ float __uint_as_float(unsigned int __a) {
+  return __nv_uint_as_float(__a);
+}
+__DEVICE__ double __ull2double_rd(unsigned long long __a) {
+  return __nv_ull2double_rd(__a);
+}
+__DEVICE__ double __ull2double_rn(unsigned long long __a) {
+  return __nv_ull2double_rn(__a);
+}
+__DEVICE__ double __ull2double_ru(unsigned long long __a) {
+  return __nv_ull2double_ru(__a);
+}
+__DEVICE__ double __ull2double_rz(unsigned long long __a) {
+  return __nv_ull2double_rz(__a);
+}
+__DEVICE__ float __ull2float_rd(unsigned long long __a) {
+  return __nv_ull2float_rd(__a);
+}
+__DEVICE__ float __ull2float_rn(unsigned long long __a) {
+  return __nv_ull2float_rn(__a);
+}
+__DEVICE__ float __ull2float_ru(unsigned long long __a) {
+  return __nv_ull2float_ru(__a);
+}
+__DEVICE__ float __ull2float_rz(unsigned long long __a) {
+  return __nv_ull2float_rz(__a);
+}
+__DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long *__p,
+                                             unsigned long long __v) {
+  return __nvvm_atom_add_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_add_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_add_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long *__p,
+                                             unsigned long long __v) {
+  return __nvvm_atom_and_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_and_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_and_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long *__p,
+                                             unsigned long long __cmp,
+                                             unsigned long long __v) {
+  return __nvvm_atom_cas_gen_ll((long long *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicCAS_block(unsigned long long *__p,
+                                                   unsigned long long __cmp,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_cas_gen_ll((long long *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicCAS_system(unsigned long long *__p,
+                                                    unsigned long long __cmp,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_cas_gen_ll((long long *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicExch(unsigned long long *__p,
+                                              unsigned long long __v) {
+  return __nvvm_atom_xchg_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_cta_xchg_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long *__p,
+                                                     unsigned long long __v) {
+  return __nvvm_atom_sys_xchg_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMax(unsigned long long *__p,
+                                             unsigned long long __v) {
+  return __nvvm_atom_max_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_max_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_max_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMin(unsigned long long *__p,
+                                             unsigned long long __v) {
+  return __nvvm_atom_min_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_min_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_min_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicOr(unsigned long long *__p,
+                                            unsigned long long __v) {
+  return __nvvm_atom_or_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long *__p,
+                                                  unsigned long long __v) {
+  return __nvvm_atom_cta_or_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_sys_or_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicXor(unsigned long long *__p,
+                                             unsigned long long __v) {
+  return __nvvm_atom_xor_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_xor_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_xor_gen_ll((long long *)__p, __v);
+}
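+// The suffixes above pick the scope of the atomic operation: the unsuffixed
+// form maps to the default device-wide builtin, _block to the cta
+// (thread-block) variant, and _system to the system-wide variant, e.g.
+// (illustrative):
+//   __ullAtomicAdd(&__ctr, 1);        // device scope
+//   __ullAtomicAdd_block(&__ctr, 1);  // block scope (__nvvm_atom_cta_*)
+//   __ullAtomicAdd_system(&__ctr, 1); // system scope (__nvvm_atom_sys_*)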
+__DEVICE__ unsigned int __umul24(unsigned int __a, unsigned int __b) {
+  return __nv_umul24(__a, __b);
+}
+__DEVICE__ unsigned long long __umul64hi(unsigned long long __a,
+                                         unsigned long long __b) {
+  return __nv_umul64hi(__a, __b);
+}
+__DEVICE__ unsigned int __umulhi(unsigned int __a, unsigned int __b) {
+  return __nv_umulhi(__a, __b);
+}
+__DEVICE__ unsigned int __urhadd(unsigned int __a, unsigned int __b) {
+  return __nv_urhadd(__a, __b);
+}
+__DEVICE__ unsigned int __usad(unsigned int __a, unsigned int __b,
+                               unsigned int __c) {
+  return __nv_usad(__a, __b, __c);
+}
+
+#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
+__DEVICE__ unsigned int __vabs2(unsigned int __a) { return __nv_vabs2(__a); }
+__DEVICE__ unsigned int __vabs4(unsigned int __a) { return __nv_vabs4(__a); }
+__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) {
+  return __nv_vabsdiffs2(__a, __b);
+}
+__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) {
+  return __nv_vabsdiffs4(__a, __b);
+}
+__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) {
+  return __nv_vabsdiffu2(__a, __b);
+}
+__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) {
+  return __nv_vabsdiffu4(__a, __b);
+}
+__DEVICE__ unsigned int __vabsss2(unsigned int __a) {
+  return __nv_vabsss2(__a);
+}
+__DEVICE__ unsigned int __vabsss4(unsigned int __a) {
+  return __nv_vabsss4(__a);
+}
+__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) {
+  return __nv_vadd2(__a, __b);
+}
+__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) {
+  return __nv_vadd4(__a, __b);
+}
+__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) {
+  return __nv_vaddss2(__a, __b);
+}
+__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) {
+  return __nv_vaddss4(__a, __b);
+}
+__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) {
+  return __nv_vaddus2(__a, __b);
+}
+__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) {
+  return __nv_vaddus4(__a, __b);
+}
+__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) {
+  return __nv_vavgs2(__a, __b);
+}
+__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) {
+  return __nv_vavgs4(__a, __b);
+}
+__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) {
+  return __nv_vavgu2(__a, __b);
+}
+__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) {
+  return __nv_vavgu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpeq2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpeq4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpges2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpges4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgeu2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgeu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgts2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgts4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgtu2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgtu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmples2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmples4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpleu2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpleu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmplts2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmplts4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpltu2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpltu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpne2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpne4(__a, __b);
+}
+__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) {
+  return __nv_vhaddu2(__a, __b);
+}
+__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) {
+  return __nv_vhaddu4(__a, __b);
+}
+__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) {
+  return __nv_vmaxs2(__a, __b);
+}
+__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) {
+  return __nv_vmaxs4(__a, __b);
+}
+__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) {
+  return __nv_vmaxu2(__a, __b);
+}
+__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) {
+  return __nv_vmaxu4(__a, __b);
+}
+__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) {
+  return __nv_vmins2(__a, __b);
+}
+__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) {
+  return __nv_vmins4(__a, __b);
+}
+__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) {
+  return __nv_vminu2(__a, __b);
+}
+__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) {
+  return __nv_vminu4(__a, __b);
+}
+__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __nv_vneg2(__a); }
+__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __nv_vneg4(__a); }
+__DEVICE__ unsigned int __vnegss2(unsigned int __a) {
+  return __nv_vnegss2(__a);
+}
+__DEVICE__ unsigned int __vnegss4(unsigned int __a) {
+  return __nv_vnegss4(__a);
+}
+__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) {
+  return __nv_vsads2(__a, __b);
+}
+__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) {
+  return __nv_vsads4(__a, __b);
+}
+__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsadu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsadu4(__a, __b);
+}
+__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) {
+  return __nv_vseteq2(__a, __b);
+}
+__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) {
+  return __nv_vseteq4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetges2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetges4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgeu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgeu4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgts2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgts4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgtu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgtu4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetles2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetles4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetleu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetleu4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetlts2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetlts4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetltu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetltu4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetne2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetne4(__a, __b);
+}
+__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) {
+  return __nv_vsub2(__a, __b);
+}
+__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) {
+  return __nv_vsub4(__a, __b);
+}
+__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) {
+  return __nv_vsubss2(__a, __b);
+}
+__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) {
+  return __nv_vsubss4(__a, __b);
+}
+__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) {
+  return __nv_vsubus2(__a, __b);
+}
+__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
+  return __nv_vsubus4(__a, __b);
+}
+#else // CUDA_VERSION >= 9020
+// CUDA no longer provides an inline assembly (or bitcode) implementation of
+// these functions, so we have to reimplement them. The implementation is
+// naive and not optimized for performance.
+
+// Helper function to convert N-bit boolean subfields into all-0 or all-1.
+// E.g. __bool2mask(0x01000100,8) -> 0xff00ff00
+//      __bool2mask(0x00010000,16) -> 0xffff0000
+__DEVICE__ unsigned int __bool2mask(unsigned int __a, int __shift) {
+  return (__a << __shift) - __a;
+}
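+// The __vcmp* functions below compose the corresponding __vset* function
+// (which yields 0/1 per sub-element) with __bool2mask, e.g.
+//   __vcmpeq2(__a, __b) == __bool2mask(__vseteq2(__a, __b), 16)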
+__DEVICE__ unsigned int __vabs2(unsigned int __a) {
+  unsigned int r;
+  asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(0), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vabs4(unsigned int __a) {
+  unsigned int r;
+  asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(0), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+
+__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vabsss2(unsigned int __a) {
+  unsigned int r;
+  asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(0), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vabsss4(unsigned int __a) {
+  unsigned int r;
+  asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(0), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vadd2.s32.s32.s32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vadd4.s32.s32.s32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vadd2.u32.u32.u32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vadd4.u32.u32.u32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vseteq2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vseteq4(__a, __b), 8);
+}
+__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetges2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetges4(__a, __b), 8);
+}
+__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetgeu2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetgeu4(__a, __b), 8);
+}
+__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetgts2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetgts4(__a, __b), 8);
+}
+__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetgtu2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetgtu4(__a, __b), 8);
+}
+__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetles2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetles4(__a, __b), 8);
+}
+__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetleu2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetleu4(__a, __b), 8);
+}
+__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetlts2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetlts4(__a, __b), 8);
+}
+__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetltu2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetltu4(__a, __b), 8);
+}
+__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetne2(__a, __b), 16);
+}
+__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) {
+  return __bool2mask(__vsetne4(__a, __b), 8);
+}
+
+// Based on ITEM 23 in AIM-239: http://dspace.mit.edu/handle/1721.1/6086
+// (a & b) + (a | b) = a + b = (a ^ b) + 2 * (a & b) =>
+// (a + b) / 2 = ((a ^ b) >> 1) + (a & b)
+// To operate on multiple sub-elements we need to make sure to mask out bits
+// that crossed over into adjacent elements during the shift.
+__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) {
+  return (((__a ^ __b) >> 1) & ~0x80008000u) + (__a & __b);
+}
+__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) {
+  return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b);
+}
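+// Worked example of the identity above: with __a = 0x00030001 and
+// __b = 0x00010003, __a ^ __b = 0x00020002 and __a & __b = 0x00010001, so
+// __vhaddu2 returns 0x00010001 + 0x00010001 = 0x00020002 -- the per-halfword
+// averages (3+1)/2 and (1+3)/2, with no carry leaking across halfwords.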
+
+__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  if ((__a & 0x8000) && (__b & 0x8000)) {
+    // Work around a bug in ptxas which produces an invalid result if the low
+    // element is negative.
+    unsigned mask = __vcmpgts2(__a, __b);
+    r = (__a & mask) | (__b & ~mask);
+  } else {
+    asm("vmax2.s32.s32.s32 %0,%1,%2,%3;"
+        : "=r"(r)
+        : "r"(__a), "r"(__b), "r"(0));
+  }
+  return r;
+}
+__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+
+__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); }
+
+__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); }
+__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vsub2.s32.s32.s32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vnegss2(unsigned int __a) {
+  return __vsubss2(0, __a);
+}
+__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vsub4.s32.s32.s32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vnegss4(unsigned int __a) {
+  return __vsubss4(0, __a);
+}
+__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vsub2.u32.u32.u32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
+  unsigned int r;
+  asm("vsub4.u32.u32.u32.sat %0,%1,%2,%3;"
+      : "=r"(r)
+      : "r"(__a), "r"(__b), "r"(0));
+  return r;
+}
+#endif // CUDA_VERSION >= 9020
+__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
+__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
+__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
+__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
+__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); }
+__DEVICE__ double asin(double __a) { return __nv_asin(__a); }
+__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); }
+__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); }
+__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); }
+__DEVICE__ double atan(double __a) { return __nv_atan(__a); }
+__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); }
+__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); }
+__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); }
+__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); }
+__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); }
+__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
+__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
+__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
+__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
+__DEVICE__ int clock() { return __nvvm_read_ptx_sreg_clock(); }
+__DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); }
+__DEVICE__ double copysign(double __a, double __b) {
+  return __nv_copysign(__a, __b);
+}
+__DEVICE__ float copysignf(float __a, float __b) {
+  return __nv_copysignf(__a, __b);
+}
+__DEVICE__ double cos(double __a) { return __nv_cos(__a); }
+__DEVICE__ float cosf(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
+}
+__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); }
+__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); }
+__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); }
+__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); }
+__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); }
+__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); }
+__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); }
+__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); }
+__DEVICE__ double erf(double __a) { return __nv_erf(__a); }
+__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); }
+__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); }
+__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); }
+__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); }
+__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); }
+__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); }
+__DEVICE__ float erff(float __a) { return __nv_erff(__a); }
+__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); }
+__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); }
+__DEVICE__ double exp(double __a) { return __nv_exp(__a); }
+__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); }
+__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); }
+__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); }
+__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
+__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
+__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
+__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
+__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
+__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
+__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
+__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
+__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
+__DEVICE__ float fdividef(float __a, float __b) {
+#if __FAST_MATH__ && !__CUDA_PREC_DIV
+  return __nv_fast_fdividef(__a, __b);
+#else
+  return __a / __b;
+#endif
+}
+__DEVICE__ double floor(double __f) { return __nv_floor(__f); }
+__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); }
+__DEVICE__ double fma(double __a, double __b, double __c) {
+  return __nv_fma(__a, __b, __c);
+}
+__DEVICE__ float fmaf(float __a, float __b, float __c) {
+  return __nv_fmaf(__a, __b, __c);
+}
+__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); }
+__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); }
+__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); }
+__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); }
+__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); }
+__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); }
+__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); }
+__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); }
+__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); }
+__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); }
+__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); }
+__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); }
+__DEVICE__ double j0(double __a) { return __nv_j0(__a); }
+__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); }
+__DEVICE__ double j1(double __a) { return __nv_j1(__a); }
+__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
+__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
+__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
+#if defined(__LP64__)
+__DEVICE__ long labs(long __a) { return llabs(__a); }
+#else
+__DEVICE__ long labs(long __a) { return __nv_abs(__a); }
+#endif
+__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
+__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
+__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
+__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
+__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); }
+__DEVICE__ long long llmax(long long __a, long long __b) {
+  return __nv_llmax(__a, __b);
+}
+__DEVICE__ long long llmin(long long __a, long long __b) {
+  return __nv_llmin(__a, __b);
+}
+__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
+__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
+__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
+__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
+__DEVICE__ double log(double __a) { return __nv_log(__a); }
+__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
+__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
+__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); }
+__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); }
+__DEVICE__ double log2(double __a) { return __nv_log2(__a); }
+__DEVICE__ float log2f(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a);
+}
+__DEVICE__ double logb(double __a) { return __nv_logb(__a); }
+__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
+__DEVICE__ float logf(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
+}
+#if defined(__LP64__)
+__DEVICE__ long lrint(double __a) { return llrint(__a); }
+__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
+__DEVICE__ long lround(double __a) { return llround(__a); }
+__DEVICE__ long lroundf(float __a) { return llroundf(__a); }
+#else
+__DEVICE__ long lrint(double __a) { return (long)rint(__a); }
+__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); }
+__DEVICE__ long lround(double __a) { return round(__a); }
+__DEVICE__ long lroundf(float __a) { return roundf(__a); }
+#endif
+__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
+__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) {
+  return __builtin_memcpy(__a, __b, __c);
+}
+__DEVICE__ void *memset(void *__a, int __b, size_t __c) {
+  return __builtin_memset(__a, __b, __c);
+}
+__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
+__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
+__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
+__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); }
+__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); }
+__DEVICE__ double nextafter(double __a, double __b) {
+  return __nv_nextafter(__a, __b);
+}
+__DEVICE__ float nextafterf(float __a, float __b) {
+  return __nv_nextafterf(__a, __b);
+}
+__DEVICE__ double norm(int __dim, const double *__t) {
+  return __nv_norm(__dim, __t);
+}
+__DEVICE__ double norm3d(double __a, double __b, double __c) {
+  return __nv_norm3d(__a, __b, __c);
+}
+__DEVICE__ float norm3df(float __a, float __b, float __c) {
+  return __nv_norm3df(__a, __b, __c);
+}
+__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) {
+  return __nv_norm4d(__a, __b, __c, __d);
+}
+__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) {
+  return __nv_norm4df(__a, __b, __c, __d);
+}
+__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); }
+__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); }
+__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); }
+__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); }
+__DEVICE__ float normf(int __dim, const float *__t) {
+  return __nv_normf(__dim, __t);
+}
+__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); }
+__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); }
+__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); }
+__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); }
+__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); }
+__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); }
+__DEVICE__ double remainder(double __a, double __b) {
+  return __nv_remainder(__a, __b);
+}
+__DEVICE__ float remainderf(float __a, float __b) {
+  return __nv_remainderf(__a, __b);
+}
+__DEVICE__ double remquo(double __a, double __b, int *__c) {
+  return __nv_remquo(__a, __b, __c);
+}
+__DEVICE__ float remquof(float __a, float __b, int *__c) {
+  return __nv_remquof(__a, __b, __c);
+}
+__DEVICE__ double rhypot(double __a, double __b) {
+  return __nv_rhypot(__a, __b);
+}
+__DEVICE__ float rhypotf(float __a, float __b) {
+  return __nv_rhypotf(__a, __b);
+}
+__DEVICE__ double rint(double __a) { return __nv_rint(__a); }
+__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); }
+__DEVICE__ double rnorm(int __a, const double *__b) {
+  return __nv_rnorm(__a, __b);
+}
+__DEVICE__ double rnorm3d(double __a, double __b, double __c) {
+  return __nv_rnorm3d(__a, __b, __c);
+}
+__DEVICE__ float rnorm3df(float __a, float __b, float __c) {
+  return __nv_rnorm3df(__a, __b, __c);
+}
+__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) {
+  return __nv_rnorm4d(__a, __b, __c, __d);
+}
+__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
+  return __nv_rnorm4df(__a, __b, __c, __d);
+}
+__DEVICE__ float rnormf(int __dim, const float *__t) {
+  return __nv_rnormf(__dim, __t);
+}
+__DEVICE__ double round(double __a) { return __nv_round(__a); }
+__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
+__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
+__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
+__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
+__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
+__DEVICE__ double scalbln(double __a, long __b) {
+  if (__b > INT_MAX)
+    return __a > 0 ? HUGE_VAL : -HUGE_VAL;
+  if (__b < INT_MIN)
+    return __a > 0 ? 0.0 : -0.0;
+  return scalbn(__a, (int)__b);
+}
+__DEVICE__ float scalblnf(float __a, long __b) {
+  if (__b > INT_MAX)
+    return __a > 0 ? HUGE_VALF : -HUGE_VALF;
+  if (__b < INT_MIN)
+    return __a > 0 ? 0.f : -0.f;
+  return scalbnf(__a, (int)__b);
+}
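+// Illustrative check of the clamping above: scalbln(1.0, 3) == 8.0, while an
+// exponent above INT_MAX saturates to +/-HUGE_VAL and one below INT_MIN
+// flushes to a signed zero, the limits of repeated scaling.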
+__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
+__DEVICE__ void sincos(double __a, double *__sptr, double *__cptr) {
+  return __nv_sincos(__a, __sptr, __cptr);
+}
+__DEVICE__ void sincosf(float __a, float *__sptr, float *__cptr) {
+  return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __sptr, __cptr);
+}
+__DEVICE__ void sincospi(double __a, double *__sptr, double *__cptr) {
+  return __nv_sincospi(__a, __sptr, __cptr);
+}
+__DEVICE__ void sincospif(float __a, float *__sptr, float *__cptr) {
+  return __nv_sincospif(__a, __sptr, __cptr);
+}
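+// Illustrative use of the pointer-out variants above (hypothetical names):
+//   double __s, __c;
+//   sincos(__angle, &__s, &__c); // computes sin and cos in a single call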
+__DEVICE__ float sinf(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
+}
+__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); }
+__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
+__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
+__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
+__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
+__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
+__DEVICE__ double tan(double __a) { return __nv_tan(__a); }
+__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
+__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
+__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); }
+__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); }
+__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); }
+__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); }
+__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); }
+__DEVICE__ unsigned long long ullmax(unsigned long long __a,
+                                     unsigned long long __b) {
+  return __nv_ullmax(__a, __b);
+}
+__DEVICE__ unsigned long long ullmin(unsigned long long __a,
+                                     unsigned long long __b) {
+  return __nv_ullmin(__a, __b);
+}
+__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) {
+  return __nv_umax(__a, __b);
+}
+__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) {
+  return __nv_umin(__a, __b);
+}
+__DEVICE__ double y0(double __a) { return __nv_y0(__a); }
+__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); }
+__DEVICE__ double y1(double __a) { return __nv_y1(__a); }
+__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
+__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
+__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
+
+#pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__FAST_OR_SLOW")
+#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
diff --git a/darwin-x86/clang-headers/__clang_cuda_intrinsics.h b/darwin-x86/clang-headers/__clang_cuda_intrinsics.h
index 3df41fa..3c0cde9 100644
--- a/darwin-x86/clang-headers/__clang_cuda_intrinsics.h
+++ b/darwin-x86/clang-headers/__clang_cuda_intrinsics.h
@@ -34,64 +34,231 @@
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
 
 #pragma push_macro("__MAKE_SHUFFLES")
-#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask)    \
-  inline __device__ int __FnName(int __in, int __offset,                       \
+#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,    \
+                        __Type)                                                \
+  inline __device__ int __FnName(int __val, __Type __offset,                   \
                                  int __width = warpSize) {                     \
-    return __IntIntrinsic(__in, __offset,                                      \
+    return __IntIntrinsic(__val, __offset,                                     \
                           ((warpSize - __width) << 8) | (__Mask));             \
   }                                                                            \
-  inline __device__ float __FnName(float __in, int __offset,                   \
+  inline __device__ float __FnName(float __val, __Type __offset,               \
                                    int __width = warpSize) {                   \
-    return __FloatIntrinsic(__in, __offset,                                    \
+    return __FloatIntrinsic(__val, __offset,                                   \
                             ((warpSize - __width) << 8) | (__Mask));           \
   }                                                                            \
-  inline __device__ unsigned int __FnName(unsigned int __in, int __offset,     \
+  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                           int __width = warpSize) {            \
     return static_cast<unsigned int>(                                          \
-        ::__FnName(static_cast<int>(__in), __offset, __width));                \
+        ::__FnName(static_cast<int>(__val), __offset, __width));               \
   }                                                                            \
-  inline __device__ long long __FnName(long long __in, int __offset,           \
+  inline __device__ long long __FnName(long long __val, __Type __offset,       \
                                        int __width = warpSize) {               \
     struct __Bits {                                                            \
       int __a, __b;                                                            \
     };                                                                         \
-    _Static_assert(sizeof(__in) == sizeof(__Bits));                            \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
     _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
     __Bits __tmp;                                                              \
-    memcpy(&__in, &__tmp, sizeof(__in));                                       \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
     __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
     __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
-    long long __out;                                                           \
-    memcpy(&__out, &__tmp, sizeof(__tmp));                                     \
-    return __out;                                                              \
+    long long __ret;                                                           \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
+    return __ret;                                                              \
+  }                                                                            \
+  inline __device__ long __FnName(long __val, __Type __offset,                 \
+                                  int __width = warpSize) {                    \
+    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
+                   sizeof(long) == sizeof(int));                               \
+    if (sizeof(long) == sizeof(long long)) {                                   \
+      return static_cast<long>(                                                \
+          ::__FnName(static_cast<long long>(__val), __offset, __width));       \
+    } else if (sizeof(long) == sizeof(int)) {                                  \
+      return static_cast<long>(                                                \
+          ::__FnName(static_cast<int>(__val), __offset, __width));             \
+    }                                                                          \
+  }                                                                            \
+  inline __device__ unsigned long __FnName(                                    \
+      unsigned long __val, __Type __offset, int __width = warpSize) {          \
+    return static_cast<unsigned long>(                                         \
+        ::__FnName(static_cast<long>(__val), __offset, __width));              \
   }                                                                            \
   inline __device__ unsigned long long __FnName(                               \
-      unsigned long long __in, int __offset, int __width = warpSize) {         \
-    return static_cast<unsigned long long>(                                    \
-        ::__FnName(static_cast<unsigned long long>(__in), __offset, __width)); \
+      unsigned long long __val, __Type __offset, int __width = warpSize) {     \
+    return static_cast<unsigned long long>(::__FnName(                         \
+        static_cast<long long>(__val), __offset, __width));                    \
   }                                                                            \
-  inline __device__ double __FnName(double __in, int __offset,                 \
+  inline __device__ double __FnName(double __val, __Type __offset,             \
                                     int __width = warpSize) {                  \
     long long __tmp;                                                           \
-    _Static_assert(sizeof(__tmp) == sizeof(__in));                             \
-    memcpy(&__tmp, &__in, sizeof(__in));                                       \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
     __tmp = ::__FnName(__tmp, __offset, __width);                              \
-    double __out;                                                              \
-    memcpy(&__out, &__tmp, sizeof(__out));                                     \
-    return __out;                                                              \
+    double __ret;                                                              \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
+    return __ret;                                                              \
   }
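+// Note: there is no 64-bit shuffle instruction, so the long long and double
+// overloads above split the value into two 32-bit halves (memcpy being the
+// aliasing-safe way to type-pun), shuffle each half, and reassemble.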
 
-__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
+__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
 // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
 // maxLane.
-__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
-__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
-__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
-
+__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
+                unsigned int);
+__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
+                unsigned int);
+__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
+                int);
 #pragma pop_macro("__MAKE_SHUFFLES")
 
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
 
+#if CUDA_VERSION >= 9000
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+// __shfl_sync_* variants available in CUDA-9
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
+                             __Mask, __Type)                                   \
+  inline __device__ int __FnName(unsigned int __mask, int __val,               \
+                                 __Type __offset, int __width = warpSize) {    \
+    return __IntIntrinsic(__mask, __val, __offset,                             \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ float __FnName(unsigned int __mask, float __val,           \
+                                   __Type __offset, int __width = warpSize) {  \
+    return __FloatIntrinsic(__mask, __val, __offset,                           \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  }                                                                            \
+  inline __device__ unsigned int __FnName(unsigned int __mask,                 \
+                                          unsigned int __val, __Type __offset, \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
+  }                                                                            \
+  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
+                                       __Type __offset,                        \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
+    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
+    long long __ret;                                                           \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
+    return __ret;                                                              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned int __mask, unsigned long long __val, __Type __offset,          \
+      int __width = warpSize) {                                                \
+    return static_cast<unsigned long long>(::__FnName(                         \
+        __mask, static_cast<long long>(__val), __offset, __width));            \
+  }                                                                            \
+  inline __device__ long __FnName(unsigned int __mask, long __val,             \
+                                  __Type __offset, int __width = warpSize) {   \
+    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
+                   sizeof(long) == sizeof(int));                               \
+    if (sizeof(long) == sizeof(long long)) {                                   \
+      return static_cast<long>(::__FnName(                                     \
+          __mask, static_cast<long long>(__val), __offset, __width));          \
+    } else if (sizeof(long) == sizeof(int)) {                                  \
+      return static_cast<long>(                                                \
+          ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
+    }                                                                          \
+  }                                                                            \
+  inline __device__ unsigned long __FnName(                                    \
+      unsigned int __mask, unsigned long __val, __Type __offset,               \
+      int __width = warpSize) {                                                \
+    return static_cast<unsigned long>(                                         \
+        ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
+  }                                                                            \
+  inline __device__ double __FnName(unsigned int __mask, double __val,         \
+                                    __Type __offset, int __width = warpSize) { \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                      \
+    double __ret;                                                              \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
+    return __ret;                                                              \
+  }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
+                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
+__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
+                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
+__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
+                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
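+
+// Illustrative sketch (not part of this header): a warp-wide sum built on the
+// overloads generated above, assuming all 32 lanes of the warp are active.
+//   __device__ int __warp_sum(int __v) {
+//     for (int __d = 16; __d > 0; __d >>= 1)
+//       __v += __shfl_xor_sync(0xffffffff, __v, __d);
+//     return __v; // every lane now holds the sum across the warp
+//   }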
+
+inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
+  return __nvvm_bar_warp_sync(mask);
+}
+
+inline __device__ void __barrier_sync(unsigned int id) {
+  __nvvm_barrier_sync(id);
+}
+
+inline __device__ void __barrier_sync_count(unsigned int id,
+                                            unsigned int count) {
+  __nvvm_barrier_sync_cnt(id, count);
+}
+
+inline __device__ int __all_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_all_sync(mask, pred);
+}
+
+inline __device__ int __any_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_any_sync(mask, pred);
+}
+
+inline __device__ int __uni_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_uni_sync(mask, pred);
+}
+
+inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+
+inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
+  return __nvvm_fns(mask, base, offset);
+}
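+
+// Illustrative sketch (assumes a __popc population-count intrinsic is in
+// scope): count the active lanes whose predicate holds.
+//   unsigned int __votes = __ballot_sync(__activemask(), __pred);
+//   int __n = __popc(__votes);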
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+// Define the __match* builtins that the CUDA-9 headers expect to see.
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+inline __device__ unsigned int __match32_any_sync(unsigned int mask,
+                                                  unsigned int value) {
+  return __nvvm_match_any_sync_i32(mask, value);
+}
+
+inline __device__ unsigned long long
+__match64_any_sync(unsigned int mask, unsigned long long value) {
+  return __nvvm_match_any_sync_i64(mask, value);
+}
+
+inline __device__ unsigned int
+__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
+  return __nvvm_match_all_sync_i32p(mask, value, pred);
+}
+
+inline __device__ unsigned long long
+__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
+  return __nvvm_match_all_sync_i64p(mask, value, pred);
+}
+#include "crt/sm_70_rt.hpp"
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+#endif // CUDA_VERSION >= 9000
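
__match32_any_sync returns, for each lane, the mask of lanes whose value equals
its own, which makes per-value leader election cheap. A hedged sketch (requires
compiling for sm_70+; the lane-index derivation assumes a one-dimensional
block, and __ffs is the standard CUDA device builtin):

inline __device__ bool __is_group_leader_sketch(unsigned int __key) {
  unsigned int __peers = __match32_any_sync(__activemask(), __key);
  unsigned int __lane = threadIdx.x % 32;               // 1-D block assumption
  return (unsigned int)(__ffs((int)__peers) - 1) == __lane; // lowest peer leads
}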
+
 // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
 
 // Prevent the vanilla sm_32 intrinsics header from being included.
@@ -110,6 +277,9 @@
 inline __device__ unsigned char __ldg(const unsigned char *ptr) {
   return __nvvm_ldg_uc(ptr);
 }
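+// There is no separate signed-char load builtin; reuse the unsigned byte
+// load and let the return type convert the value back.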
+inline __device__ signed char __ldg(const signed char *ptr) {
+  return __nvvm_ldg_uc((const unsigned char *)ptr);
+}
 inline __device__ unsigned short __ldg(const unsigned short *ptr) {
   return __nvvm_ldg_us(ptr);
 }
diff --git a/darwin-x86/clang-headers/__clang_cuda_libdevice_declares.h b/darwin-x86/clang-headers/__clang_cuda_libdevice_declares.h
new file mode 100644
index 0000000..71df7f8
--- /dev/null
+++ b/darwin-x86/clang-headers/__clang_cuda_libdevice_declares.h
@@ -0,0 +1,466 @@
+/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
+#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
+
+extern "C" {
+
+__device__ int __nv_abs(int __a);
+__device__ double __nv_acos(double __a);
+__device__ float __nv_acosf(float __a);
+__device__ double __nv_acosh(double __a);
+__device__ float __nv_acoshf(float __a);
+__device__ double __nv_asin(double __a);
+__device__ float __nv_asinf(float __a);
+__device__ double __nv_asinh(double __a);
+__device__ float __nv_asinhf(float __a);
+__device__ double __nv_atan2(double __a, double __b);
+__device__ float __nv_atan2f(float __a, float __b);
+__device__ double __nv_atan(double __a);
+__device__ float __nv_atanf(float __a);
+__device__ double __nv_atanh(double __a);
+__device__ float __nv_atanhf(float __a);
+__device__ int __nv_brev(int __a);
+__device__ long long __nv_brevll(long long __a);
+__device__ int __nv_byte_perm(int __a, int __b, int __c);
+__device__ double __nv_cbrt(double __a);
+__device__ float __nv_cbrtf(float __a);
+__device__ double __nv_ceil(double __a);
+__device__ float __nv_ceilf(float __a);
+__device__ int __nv_clz(int __a);
+__device__ int __nv_clzll(long long __a);
+__device__ double __nv_copysign(double __a, double __b);
+__device__ float __nv_copysignf(float __a, float __b);
+__device__ double __nv_cos(double __a);
+__device__ float __nv_cosf(float __a);
+__device__ double __nv_cosh(double __a);
+__device__ float __nv_coshf(float __a);
+__device__ double __nv_cospi(double __a);
+__device__ float __nv_cospif(float __a);
+__device__ double __nv_cyl_bessel_i0(double __a);
+__device__ float __nv_cyl_bessel_i0f(float __a);
+__device__ double __nv_cyl_bessel_i1(double __a);
+__device__ float __nv_cyl_bessel_i1f(float __a);
+__device__ double __nv_dadd_rd(double __a, double __b);
+__device__ double __nv_dadd_rn(double __a, double __b);
+__device__ double __nv_dadd_ru(double __a, double __b);
+__device__ double __nv_dadd_rz(double __a, double __b);
+__device__ double __nv_ddiv_rd(double __a, double __b);
+__device__ double __nv_ddiv_rn(double __a, double __b);
+__device__ double __nv_ddiv_ru(double __a, double __b);
+__device__ double __nv_ddiv_rz(double __a, double __b);
+__device__ double __nv_dmul_rd(double __a, double __b);
+__device__ double __nv_dmul_rn(double __a, double __b);
+__device__ double __nv_dmul_ru(double __a, double __b);
+__device__ double __nv_dmul_rz(double __a, double __b);
+__device__ float __nv_double2float_rd(double __a);
+__device__ float __nv_double2float_rn(double __a);
+__device__ float __nv_double2float_ru(double __a);
+__device__ float __nv_double2float_rz(double __a);
+__device__ int __nv_double2hiint(double __a);
+__device__ int __nv_double2int_rd(double __a);
+__device__ int __nv_double2int_rn(double __a);
+__device__ int __nv_double2int_ru(double __a);
+__device__ int __nv_double2int_rz(double __a);
+__device__ long long __nv_double2ll_rd(double __a);
+__device__ long long __nv_double2ll_rn(double __a);
+__device__ long long __nv_double2ll_ru(double __a);
+__device__ long long __nv_double2ll_rz(double __a);
+__device__ int __nv_double2loint(double __a);
+__device__ unsigned int __nv_double2uint_rd(double __a);
+__device__ unsigned int __nv_double2uint_rn(double __a);
+__device__ unsigned int __nv_double2uint_ru(double __a);
+__device__ unsigned int __nv_double2uint_rz(double __a);
+__device__ unsigned long long __nv_double2ull_rd(double __a);
+__device__ unsigned long long __nv_double2ull_rn(double __a);
+__device__ unsigned long long __nv_double2ull_ru(double __a);
+__device__ unsigned long long __nv_double2ull_rz(double __a);
+__device__ unsigned long long __nv_double_as_longlong(double __a);
+__device__ double __nv_drcp_rd(double __a);
+__device__ double __nv_drcp_rn(double __a);
+__device__ double __nv_drcp_ru(double __a);
+__device__ double __nv_drcp_rz(double __a);
+__device__ double __nv_dsqrt_rd(double __a);
+__device__ double __nv_dsqrt_rn(double __a);
+__device__ double __nv_dsqrt_ru(double __a);
+__device__ double __nv_dsqrt_rz(double __a);
+__device__ double __nv_dsub_rd(double __a, double __b);
+__device__ double __nv_dsub_rn(double __a, double __b);
+__device__ double __nv_dsub_ru(double __a, double __b);
+__device__ double __nv_dsub_rz(double __a, double __b);
+__device__ double __nv_erfc(double __a);
+__device__ float __nv_erfcf(float __a);
+__device__ double __nv_erfcinv(double __a);
+__device__ float __nv_erfcinvf(float __a);
+__device__ double __nv_erfcx(double __a);
+__device__ float __nv_erfcxf(float __a);
+__device__ double __nv_erf(double __a);
+__device__ float __nv_erff(float __a);
+__device__ double __nv_erfinv(double __a);
+__device__ float __nv_erfinvf(float __a);
+__device__ double __nv_exp10(double __a);
+__device__ float __nv_exp10f(float __a);
+__device__ double __nv_exp2(double __a);
+__device__ float __nv_exp2f(float __a);
+__device__ double __nv_exp(double __a);
+__device__ float __nv_expf(float __a);
+__device__ double __nv_expm1(double __a);
+__device__ float __nv_expm1f(float __a);
+__device__ double __nv_fabs(double __a);
+__device__ float __nv_fabsf(float __a);
+__device__ float __nv_fadd_rd(float __a, float __b);
+__device__ float __nv_fadd_rn(float __a, float __b);
+__device__ float __nv_fadd_ru(float __a, float __b);
+__device__ float __nv_fadd_rz(float __a, float __b);
+__device__ float __nv_fast_cosf(float __a);
+__device__ float __nv_fast_exp10f(float __a);
+__device__ float __nv_fast_expf(float __a);
+__device__ float __nv_fast_fdividef(float __a, float __b);
+__device__ float __nv_fast_log10f(float __a);
+__device__ float __nv_fast_log2f(float __a);
+__device__ float __nv_fast_logf(float __a);
+__device__ float __nv_fast_powf(float __a, float __b);
+__device__ void __nv_fast_sincosf(float __a, float *__sptr, float *__cptr);
+__device__ float __nv_fast_sinf(float __a);
+__device__ float __nv_fast_tanf(float __a);
+__device__ double __nv_fdim(double __a, double __b);
+__device__ float __nv_fdimf(float __a, float __b);
+__device__ float __nv_fdiv_rd(float __a, float __b);
+__device__ float __nv_fdiv_rn(float __a, float __b);
+__device__ float __nv_fdiv_ru(float __a, float __b);
+__device__ float __nv_fdiv_rz(float __a, float __b);
+__device__ int __nv_ffs(int __a);
+__device__ int __nv_ffsll(long long __a);
+__device__ int __nv_finitef(float __a);
+__device__ unsigned short __nv_float2half_rn(float __a);
+__device__ int __nv_float2int_rd(float __a);
+__device__ int __nv_float2int_rn(float __a);
+__device__ int __nv_float2int_ru(float __a);
+__device__ int __nv_float2int_rz(float __a);
+__device__ long long __nv_float2ll_rd(float __a);
+__device__ long long __nv_float2ll_rn(float __a);
+__device__ long long __nv_float2ll_ru(float __a);
+__device__ long long __nv_float2ll_rz(float __a);
+__device__ unsigned int __nv_float2uint_rd(float __a);
+__device__ unsigned int __nv_float2uint_rn(float __a);
+__device__ unsigned int __nv_float2uint_ru(float __a);
+__device__ unsigned int __nv_float2uint_rz(float __a);
+__device__ unsigned long long __nv_float2ull_rd(float __a);
+__device__ unsigned long long __nv_float2ull_rn(float __a);
+__device__ unsigned long long __nv_float2ull_ru(float __a);
+__device__ unsigned long long __nv_float2ull_rz(float __a);
+__device__ int __nv_float_as_int(float __a);
+__device__ unsigned int __nv_float_as_uint(float __a);
+__device__ double __nv_floor(double __a);
+__device__ float __nv_floorf(float __a);
+__device__ double __nv_fma(double __a, double __b, double __c);
+__device__ float __nv_fmaf(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
+__device__ float __nv_fmaf_rd(float __a, float __b, float __c);
+__device__ float __nv_fmaf_rn(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ru(float __a, float __b, float __c);
+__device__ float __nv_fmaf_rz(float __a, float __b, float __c);
+__device__ double __nv_fma_rd(double __a, double __b, double __c);
+__device__ double __nv_fma_rn(double __a, double __b, double __c);
+__device__ double __nv_fma_ru(double __a, double __b, double __c);
+__device__ double __nv_fma_rz(double __a, double __b, double __c);
+__device__ double __nv_fmax(double __a, double __b);
+__device__ float __nv_fmaxf(float __a, float __b);
+__device__ double __nv_fmin(double __a, double __b);
+__device__ float __nv_fminf(float __a, float __b);
+__device__ double __nv_fmod(double __a, double __b);
+__device__ float __nv_fmodf(float __a, float __b);
+__device__ float __nv_fmul_rd(float __a, float __b);
+__device__ float __nv_fmul_rn(float __a, float __b);
+__device__ float __nv_fmul_ru(float __a, float __b);
+__device__ float __nv_fmul_rz(float __a, float __b);
+__device__ float __nv_frcp_rd(float __a);
+__device__ float __nv_frcp_rn(float __a);
+__device__ float __nv_frcp_ru(float __a);
+__device__ float __nv_frcp_rz(float __a);
+__device__ double __nv_frexp(double __a, int *__b);
+__device__ float __nv_frexpf(float __a, int *__b);
+__device__ float __nv_frsqrt_rn(float __a);
+__device__ float __nv_fsqrt_rd(float __a);
+__device__ float __nv_fsqrt_rn(float __a);
+__device__ float __nv_fsqrt_ru(float __a);
+__device__ float __nv_fsqrt_rz(float __a);
+__device__ float __nv_fsub_rd(float __a, float __b);
+__device__ float __nv_fsub_rn(float __a, float __b);
+__device__ float __nv_fsub_ru(float __a, float __b);
+__device__ float __nv_fsub_rz(float __a, float __b);
+__device__ int __nv_hadd(int __a, int __b);
+__device__ float __nv_half2float(unsigned short __h);
+__device__ double __nv_hiloint2double(int __a, int __b);
+__device__ double __nv_hypot(double __a, double __b);
+__device__ float __nv_hypotf(float __a, float __b);
+__device__ int __nv_ilogb(double __a);
+__device__ int __nv_ilogbf(float __a);
+__device__ double __nv_int2double_rn(int __a);
+__device__ float __nv_int2float_rd(int __a);
+__device__ float __nv_int2float_rn(int __a);
+__device__ float __nv_int2float_ru(int __a);
+__device__ float __nv_int2float_rz(int __a);
+__device__ float __nv_int_as_float(int __a);
+__device__ int __nv_isfinited(double __a);
+__device__ int __nv_isinfd(double __a);
+__device__ int __nv_isinff(float __a);
+__device__ int __nv_isnand(double __a);
+__device__ int __nv_isnanf(float __a);
+__device__ double __nv_j0(double __a);
+__device__ float __nv_j0f(float __a);
+__device__ double __nv_j1(double __a);
+__device__ float __nv_j1f(float __a);
+__device__ float __nv_jnf(int __a, float __b);
+__device__ double __nv_jn(int __a, double __b);
+__device__ double __nv_ldexp(double __a, int __b);
+__device__ float __nv_ldexpf(float __a, int __b);
+__device__ double __nv_lgamma(double __a);
+__device__ float __nv_lgammaf(float __a);
+__device__ double __nv_ll2double_rd(long long __a);
+__device__ double __nv_ll2double_rn(long long __a);
+__device__ double __nv_ll2double_ru(long long __a);
+__device__ double __nv_ll2double_rz(long long __a);
+__device__ float __nv_ll2float_rd(long long __a);
+__device__ float __nv_ll2float_rn(long long __a);
+__device__ float __nv_ll2float_ru(long long __a);
+__device__ float __nv_ll2float_rz(long long __a);
+__device__ long long __nv_llabs(long long __a);
+__device__ long long __nv_llmax(long long __a, long long __b);
+__device__ long long __nv_llmin(long long __a, long long __b);
+__device__ long long __nv_llrint(double __a);
+__device__ long long __nv_llrintf(float __a);
+__device__ long long __nv_llround(double __a);
+__device__ long long __nv_llroundf(float __a);
+__device__ double __nv_log10(double __a);
+__device__ float __nv_log10f(float __a);
+__device__ double __nv_log1p(double __a);
+__device__ float __nv_log1pf(float __a);
+__device__ double __nv_log2(double __a);
+__device__ float __nv_log2f(float __a);
+__device__ double __nv_logb(double __a);
+__device__ float __nv_logbf(float __a);
+__device__ double __nv_log(double __a);
+__device__ float __nv_logf(float __a);
+__device__ double __nv_longlong_as_double(long long __a);
+__device__ int __nv_max(int __a, int __b);
+__device__ int __nv_min(int __a, int __b);
+__device__ double __nv_modf(double __a, double *__b);
+__device__ float __nv_modff(float __a, float *__b);
+__device__ int __nv_mul24(int __a, int __b);
+__device__ long long __nv_mul64hi(long long __a, long long __b);
+__device__ int __nv_mulhi(int __a, int __b);
+__device__ double __nv_nan(const signed char *__a);
+__device__ float __nv_nanf(const signed char *__a);
+__device__ double __nv_nearbyint(double __a);
+__device__ float __nv_nearbyintf(float __a);
+__device__ double __nv_nextafter(double __a, double __b);
+__device__ float __nv_nextafterf(float __a, float __b);
+__device__ double __nv_norm3d(double __a, double __b, double __c);
+__device__ float __nv_norm3df(float __a, float __b, float __c);
+__device__ double __nv_norm4d(double __a, double __b, double __c, double __d);
+__device__ float __nv_norm4df(float __a, float __b, float __c, float __d);
+__device__ double __nv_normcdf(double __a);
+__device__ float __nv_normcdff(float __a);
+__device__ double __nv_normcdfinv(double __a);
+__device__ float __nv_normcdfinvf(float __a);
+__device__ float __nv_normf(int __a, const float *__b);
+__device__ double __nv_norm(int __a, const double *__b);
+__device__ int __nv_popc(int __a);
+__device__ int __nv_popcll(long long __a);
+__device__ double __nv_pow(double __a, double __b);
+__device__ float __nv_powf(float __a, float __b);
+__device__ double __nv_powi(double __a, int __b);
+__device__ float __nv_powif(float __a, int __b);
+__device__ double __nv_rcbrt(double __a);
+__device__ float __nv_rcbrtf(float __a);
+__device__ double __nv_rcp64h(double __a);
+__device__ double __nv_remainder(double __a, double __b);
+__device__ float __nv_remainderf(float __a, float __b);
+__device__ double __nv_remquo(double __a, double __b, int *__c);
+__device__ float __nv_remquof(float __a, float __b, int *__c);
+__device__ int __nv_rhadd(int __a, int __b);
+__device__ double __nv_rhypot(double __a, double __b);
+__device__ float __nv_rhypotf(float __a, float __b);
+__device__ double __nv_rint(double __a);
+__device__ float __nv_rintf(float __a);
+__device__ double __nv_rnorm3d(double __a, double __b, double __c);
+__device__ float __nv_rnorm3df(float __a, float __b, float __c);
+__device__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
+__device__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
+__device__ float __nv_rnormf(int __a, const float *__b);
+__device__ double __nv_rnorm(int __a, const double *__b);
+__device__ double __nv_round(double __a);
+__device__ float __nv_roundf(float __a);
+__device__ double __nv_rsqrt(double __a);
+__device__ float __nv_rsqrtf(float __a);
+__device__ int __nv_sad(int __a, int __b, int __c);
+__device__ float __nv_saturatef(float __a);
+__device__ double __nv_scalbn(double __a, int __b);
+__device__ float __nv_scalbnf(float __a, int __b);
+__device__ int __nv_signbitd(double __a);
+__device__ int __nv_signbitf(float __a);
+__device__ void __nv_sincos(double __a, double *__b, double *__c);
+__device__ void __nv_sincosf(float __a, float *__b, float *__c);
+__device__ void __nv_sincospi(double __a, double *__b, double *__c);
+__device__ void __nv_sincospif(float __a, float *__b, float *__c);
+__device__ double __nv_sin(double __a);
+__device__ float __nv_sinf(float __a);
+__device__ double __nv_sinh(double __a);
+__device__ float __nv_sinhf(float __a);
+__device__ double __nv_sinpi(double __a);
+__device__ float __nv_sinpif(float __a);
+__device__ double __nv_sqrt(double __a);
+__device__ float __nv_sqrtf(float __a);
+__device__ double __nv_tan(double __a);
+__device__ float __nv_tanf(float __a);
+__device__ double __nv_tanh(double __a);
+__device__ float __nv_tanhf(float __a);
+__device__ double __nv_tgamma(double __a);
+__device__ float __nv_tgammaf(float __a);
+__device__ double __nv_trunc(double __a);
+__device__ float __nv_truncf(float __a);
+__device__ int __nv_uhadd(unsigned int __a, unsigned int __b);
+__device__ double __nv_uint2double_rn(unsigned int __i);
+__device__ float __nv_uint2float_rd(unsigned int __a);
+__device__ float __nv_uint2float_rn(unsigned int __a);
+__device__ float __nv_uint2float_ru(unsigned int __a);
+__device__ float __nv_uint2float_rz(unsigned int __a);
+__device__ float __nv_uint_as_float(unsigned int __a);
+__device__ double __nv_ull2double_rd(unsigned long long __a);
+__device__ double __nv_ull2double_rn(unsigned long long __a);
+__device__ double __nv_ull2double_ru(unsigned long long __a);
+__device__ double __nv_ull2double_rz(unsigned long long __a);
+__device__ float __nv_ull2float_rd(unsigned long long __a);
+__device__ float __nv_ull2float_rn(unsigned long long __a);
+__device__ float __nv_ull2float_ru(unsigned long long __a);
+__device__ float __nv_ull2float_rz(unsigned long long __a);
+__device__ unsigned long long __nv_ullmax(unsigned long long __a,
+                                          unsigned long long __b);
+__device__ unsigned long long __nv_ullmin(unsigned long long __a,
+                                          unsigned long long __b);
+__device__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
+__device__ unsigned long long __nv_umul64hi(unsigned long long __a,
+                                            unsigned long long __b);
+__device__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
+                                  unsigned int __c);
+#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
+__device__ int __nv_vabs2(int __a);
+__device__ int __nv_vabs4(int __a);
+__device__ int __nv_vabsdiffs2(int __a, int __b);
+__device__ int __nv_vabsdiffs4(int __a, int __b);
+__device__ int __nv_vabsdiffu2(int __a, int __b);
+__device__ int __nv_vabsdiffu4(int __a, int __b);
+__device__ int __nv_vabsss2(int __a);
+__device__ int __nv_vabsss4(int __a);
+__device__ int __nv_vadd2(int __a, int __b);
+__device__ int __nv_vadd4(int __a, int __b);
+__device__ int __nv_vaddss2(int __a, int __b);
+__device__ int __nv_vaddss4(int __a, int __b);
+__device__ int __nv_vaddus2(int __a, int __b);
+__device__ int __nv_vaddus4(int __a, int __b);
+__device__ int __nv_vavgs2(int __a, int __b);
+__device__ int __nv_vavgs4(int __a, int __b);
+__device__ int __nv_vavgu2(int __a, int __b);
+__device__ int __nv_vavgu4(int __a, int __b);
+__device__ int __nv_vcmpeq2(int __a, int __b);
+__device__ int __nv_vcmpeq4(int __a, int __b);
+__device__ int __nv_vcmpges2(int __a, int __b);
+__device__ int __nv_vcmpges4(int __a, int __b);
+__device__ int __nv_vcmpgeu2(int __a, int __b);
+__device__ int __nv_vcmpgeu4(int __a, int __b);
+__device__ int __nv_vcmpgts2(int __a, int __b);
+__device__ int __nv_vcmpgts4(int __a, int __b);
+__device__ int __nv_vcmpgtu2(int __a, int __b);
+__device__ int __nv_vcmpgtu4(int __a, int __b);
+__device__ int __nv_vcmples2(int __a, int __b);
+__device__ int __nv_vcmples4(int __a, int __b);
+__device__ int __nv_vcmpleu2(int __a, int __b);
+__device__ int __nv_vcmpleu4(int __a, int __b);
+__device__ int __nv_vcmplts2(int __a, int __b);
+__device__ int __nv_vcmplts4(int __a, int __b);
+__device__ int __nv_vcmpltu2(int __a, int __b);
+__device__ int __nv_vcmpltu4(int __a, int __b);
+__device__ int __nv_vcmpne2(int __a, int __b);
+__device__ int __nv_vcmpne4(int __a, int __b);
+__device__ int __nv_vhaddu2(int __a, int __b);
+__device__ int __nv_vhaddu4(int __a, int __b);
+__device__ int __nv_vmaxs2(int __a, int __b);
+__device__ int __nv_vmaxs4(int __a, int __b);
+__device__ int __nv_vmaxu2(int __a, int __b);
+__device__ int __nv_vmaxu4(int __a, int __b);
+__device__ int __nv_vmins2(int __a, int __b);
+__device__ int __nv_vmins4(int __a, int __b);
+__device__ int __nv_vminu2(int __a, int __b);
+__device__ int __nv_vminu4(int __a, int __b);
+__device__ int __nv_vneg2(int __a);
+__device__ int __nv_vneg4(int __a);
+__device__ int __nv_vnegss2(int __a);
+__device__ int __nv_vnegss4(int __a);
+__device__ int __nv_vsads2(int __a, int __b);
+__device__ int __nv_vsads4(int __a, int __b);
+__device__ int __nv_vsadu2(int __a, int __b);
+__device__ int __nv_vsadu4(int __a, int __b);
+__device__ int __nv_vseteq2(int __a, int __b);
+__device__ int __nv_vseteq4(int __a, int __b);
+__device__ int __nv_vsetges2(int __a, int __b);
+__device__ int __nv_vsetges4(int __a, int __b);
+__device__ int __nv_vsetgeu2(int __a, int __b);
+__device__ int __nv_vsetgeu4(int __a, int __b);
+__device__ int __nv_vsetgts2(int __a, int __b);
+__device__ int __nv_vsetgts4(int __a, int __b);
+__device__ int __nv_vsetgtu2(int __a, int __b);
+__device__ int __nv_vsetgtu4(int __a, int __b);
+__device__ int __nv_vsetles2(int __a, int __b);
+__device__ int __nv_vsetles4(int __a, int __b);
+__device__ int __nv_vsetleu2(int __a, int __b);
+__device__ int __nv_vsetleu4(int __a, int __b);
+__device__ int __nv_vsetlts2(int __a, int __b);
+__device__ int __nv_vsetlts4(int __a, int __b);
+__device__ int __nv_vsetltu2(int __a, int __b);
+__device__ int __nv_vsetltu4(int __a, int __b);
+__device__ int __nv_vsetne2(int __a, int __b);
+__device__ int __nv_vsetne4(int __a, int __b);
+__device__ int __nv_vsub2(int __a, int __b);
+__device__ int __nv_vsub4(int __a, int __b);
+__device__ int __nv_vsubss2(int __a, int __b);
+__device__ int __nv_vsubss4(int __a, int __b);
+__device__ int __nv_vsubus2(int __a, int __b);
+__device__ int __nv_vsubus4(int __a, int __b);
+#endif  // CUDA_VERSION
+__device__ double __nv_y0(double __a);
+__device__ float __nv_y0f(float __a);
+__device__ double __nv_y1(double __a);
+__device__ float __nv_y1f(float __a);
+__device__ float __nv_ynf(int __a, float __b);
+__device__ double __nv_yn(int __a, double __b);
+} // extern "C"
+#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__
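
These extern "C" declarations are the raw libdevice entry points; Clang's
device-side math wrappers forward to them. A minimal sketch of that forwarding
pattern, in the style of __clang_cuda_device_functions.h (hedged: the exact
wrapper bodies in that header may differ):

__device__ inline double rsqrt(double __a) { return __nv_rsqrt(__a); }
__device__ inline float rsqrtf(float __a) { return __nv_rsqrtf(__a); }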
diff --git a/darwin-x86/clang-headers/__clang_cuda_math_forward_declares.h b/darwin-x86/clang-headers/__clang_cuda_math_forward_declares.h
index 3f2834d..c31b1f4 100644
--- a/darwin-x86/clang-headers/__clang_cuda_math_forward_declares.h
+++ b/darwin-x86/clang-headers/__clang_cuda_math_forward_declares.h
@@ -140,6 +140,7 @@
 __DEVICE__ long lrint(float);
 __DEVICE__ long lround(double);
 __DEVICE__ long lround(float);
+__DEVICE__ long long llround(float); // No llround(double).
 __DEVICE__ double modf(double, double *);
 __DEVICE__ float modf(float, float *);
 __DEVICE__ double nan(const char *);
@@ -148,8 +149,6 @@
 __DEVICE__ float nearbyint(float);
 __DEVICE__ double nextafter(double, double);
 __DEVICE__ float nextafter(float, float);
-__DEVICE__ double nexttoward(double, double);
-__DEVICE__ float nexttoward(float, float);
 __DEVICE__ double pow(double, double);
 __DEVICE__ double pow(double, int);
 __DEVICE__ float pow(float, float);
@@ -183,7 +182,23 @@
 __DEVICE__ double trunc(double);
 __DEVICE__ float trunc(float);
 
+// Notably missing above is nexttoward, which we don't define on
+// the device side because libdevice doesn't give us an implementation, and we
+// don't want to be in the business of writing one ourselves.
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
 namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
 using ::abs;
 using ::acos;
 using ::acosh;
@@ -235,12 +250,12 @@
 using ::logb;
 using ::lrint;
 using ::lround;
+using ::llround;
 using ::modf;
 using ::nan;
 using ::nanf;
 using ::nearbyint;
 using ::nextafter;
-using ::nexttoward;
 using ::pow;
 using ::remainder;
 using ::remquo;
@@ -256,7 +271,15 @@
 using ::tanh;
 using ::tgamma;
 using ::trunc;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
 } // namespace std
+#endif
 
 #pragma pop_macro("__DEVICE__")
 
diff --git a/darwin-x86/clang-headers/__clang_cuda_runtime_wrapper.h b/darwin-x86/clang-headers/__clang_cuda_runtime_wrapper.h
index 6445f9b..09705a2 100644
--- a/darwin-x86/clang-headers/__clang_cuda_runtime_wrapper.h
+++ b/darwin-x86/clang-headers/__clang_cuda_runtime_wrapper.h
@@ -62,7 +62,7 @@
 #include "cuda.h"
 #if !defined(CUDA_VERSION)
 #error "cuda.h did not define CUDA_VERSION"
-#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9020
 #error "Unsupported CUDA version!"
 #endif
 
@@ -72,9 +72,9 @@
 #define __CUDA_ARCH__ 350
 #endif
 
-#include "cuda_builtin_vars.h"
+#include "__clang_cuda_builtin_vars.h"
 
-// No need for device_launch_parameters.h as cuda_builtin_vars.h above
+// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
 // has taken care of builtin variables declared in the file.
 #define __DEVICE_LAUNCH_PARAMETERS_H__
 
@@ -84,19 +84,33 @@
 #define __DEVICE_FUNCTIONS_H__
 #define __MATH_FUNCTIONS_H__
 #define __COMMON_FUNCTIONS_H__
+// device_functions_decls is replaced by __clang_cuda_device_functions.h
+// included below.
+#define __DEVICE_FUNCTIONS_DECLS_H__
 
 #undef __CUDACC__
+#if CUDA_VERSION < 9000
 #define __CUDABE__
+#else
+#define __CUDA_LIBDEVICE__
+#endif
 // Disables definitions of device-side runtime support stubs in
 // cuda_device_runtime_api.h
 #include "driver_types.h"
 #include "host_config.h"
 #include "host_defines.h"
 
+// Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
+// cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the
+// functional equivalent of what we need.
+#pragma push_macro("nv_weak")
+#define nv_weak weak
 #undef __CUDABE__
+#undef __CUDA_LIBDEVICE__
 #define __CUDACC__
 #include "cuda_runtime.h"
 
+#pragma pop_macro("nv_weak")
 #undef __CUDACC__
 #define __CUDABE__
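
The nv_weak dance above is one instance of the save/rewrite/restore macro
pattern this wrapper relies on throughout. In isolation the pattern looks like
this (a sketch; FOO is an arbitrary placeholder name):

#pragma push_macro("FOO")   // save the current definition of FOO, if any
#define FOO replacement     // rewrite it for the next include only
// ... #include a header whose contents expand FOO ...
#pragma pop_macro("FOO")    // restore whatever definition was saved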
 
@@ -105,7 +119,9 @@
 #define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
 #define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
 
+#if CUDA_VERSION < 9000
 #include "crt/device_runtime.h"
+#endif
 #include "crt/host_runtime.h"
 // device_runtime.h defines __cxa_* macros that will conflict with
 // cxxabi.h.
@@ -113,6 +129,7 @@
 #undef __cxa_vec_ctor
 #undef __cxa_vec_cctor
 #undef __cxa_vec_dtor
+#undef __cxa_vec_new
 #undef __cxa_vec_new2
 #undef __cxa_vec_new3
 #undef __cxa_vec_delete2
@@ -120,19 +137,50 @@
 #undef __cxa_vec_delete3
 #undef __cxa_pure_virtual
 
-// We need decls for functions in CUDA's libdevice with __device__
-// attribute only. Alas they come either as __host__ __device__ or
-// with no attributes at all. To work around that, define __CUDA_RTC__
-// which produces HD variant and undef __host__ which gives us desided
-// decls with __device__ attribute.
-#pragma push_macro("__host__")
-#define __host__
-#define __CUDACC_RTC__
-#include "device_functions_decls.h"
-#undef __CUDACC_RTC__
+// math_functions.hpp expects this host function to be defined on MacOS, but
+// it ends up not being there because of the games we play here.  Just define
+// it ourselves; it's simple enough.
+#ifdef __APPLE__
+inline __host__ double __signbitd(double x) {
+  return std::signbit(x);
+}
+#endif
+
+// CUDA 9.1 no longer provides declarations for libdevice functions, so we need
+// to provide our own.
+#include <__clang_cuda_libdevice_declares.h>
+
+// Wrappers for many device-side standard library functions became compiler
+// builtins in CUDA-9 and have been removed from the CUDA headers. Clang now
+// provides its own implementation of the wrappers.
+#if CUDA_VERSION >= 9000
+#include <__clang_cuda_device_functions.h>
+#endif
+
+// __THROW is redefined to be empty by device_functions_decls.h in CUDA.
+// Clang's counterpart does not do that, so we need to make it empty here to
+// keep the following CUDA includes happy.
+#undef __THROW
+#define __THROW
+
+// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
+// Previous versions used to check whether they are defined or not.
+// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
+// here to detect the switch.
+
+#if defined(CU_DEVICE_INVALID)
+#if !defined(__USE_FAST_MATH__)
+#define __USE_FAST_MATH__ 0
+#endif
+
+#if !defined(__CUDA_PREC_DIV)
+#define __CUDA_PREC_DIV 0
+#endif
+#endif
 
 // Temporarily poison __host__ macro to ensure it's not used by any of
 // the headers we're about to include.
+#pragma push_macro("__host__")
 #define __host__ UNEXPECTED_HOST_ATTRIBUTE
 
 // device_functions.hpp and math_functions*.hpp use 'static
@@ -141,7 +189,9 @@
 // __device__.
 #pragma push_macro("__forceinline__")
 #define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+#if CUDA_VERSION < 9000
 #include "device_functions.hpp"
+#endif
 
 // math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we
 // get the slow-but-accurate or fast-but-inaccurate versions of functions like
@@ -151,19 +201,34 @@
 // slow divides), so we need to scope our define carefully here.
 #pragma push_macro("__USE_FAST_MATH__")
 #if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
-#define __USE_FAST_MATH__
+#define __USE_FAST_MATH__ 1
 #endif
+
+#if CUDA_VERSION >= 9000
+// CUDA-9.2 needs host-side memcpy for some host functions in
+// device_functions.hpp
+#if CUDA_VERSION >= 9020
+#include <string.h>
+#endif
+#include "crt/math_functions.hpp"
+#else
 #include "math_functions.hpp"
+#endif
+
 #pragma pop_macro("__USE_FAST_MATH__")
 
+#if CUDA_VERSION < 9000
 #include "math_functions_dbl_ptx3.hpp"
+#endif
 #pragma pop_macro("__forceinline__")
 
 // Pull in host-only functions that are only available when neither
 // __CUDACC__ nor __CUDABE__ is defined.
 #undef __MATH_FUNCTIONS_HPP__
 #undef __CUDABE__
+#if CUDA_VERSION < 9000
 #include "math_functions.hpp"
+#endif
 // Alas, additional overloads for these functions are hard to get to.
 // Considering that we only need these overloads for a few functions,
 // we can provide them here.
@@ -179,22 +244,36 @@
 static inline float normcdf(float __a) { return normcdff(__a); }
 static inline float erfcx(float __a) { return erfcxf(__a); }
 
+#if CUDA_VERSION < 9000
 // For some reason the single-argument variant is not always declared by
 // CUDA headers. Alas, device_functions.hpp included below needs it.
 static inline __device__ void __brkpt(int __c) { __brkpt(); }
+#endif
 
 // Now include *.hpp with definitions of various GPU functions.  Alas,
 // a lot of things get declared/defined with __host__ attribute which
 // we don't want and we have to define it out. We also have to include
 // {device,math}_functions.hpp again in order to extract the other
 // branch of #if/else inside.
-
 #define __host__
 #undef __CUDABE__
 #define __CUDACC__
+#if CUDA_VERSION >= 9000
+// Some atomic functions became compiler builtins in CUDA-9, so we need their
+// declarations.
+#include "device_atomic_functions.h"
+#endif
 #undef __DEVICE_FUNCTIONS_HPP__
 #include "device_atomic_functions.hpp"
+#if CUDA_VERSION >= 9000
+#include "crt/device_functions.hpp"
+#include "crt/device_double_functions.hpp"
+#else
 #include "device_functions.hpp"
+#define __CUDABE__
+#include "device_double_functions.h"
+#undef __CUDABE__
+#endif
 #include "sm_20_atomic_functions.hpp"
 #include "sm_20_intrinsics.hpp"
 #include "sm_32_atomic_functions.hpp"
@@ -207,6 +286,14 @@
 // hardware, seems to generate faster machine code because ptxas can more easily
 // reason about our code.
 
+#if CUDA_VERSION >= 8000
+#pragma push_macro("__CUDA_ARCH__")
+#undef __CUDA_ARCH__
+#include "sm_60_atomic_functions.hpp"
+#include "sm_61_intrinsics.hpp"
+#pragma pop_macro("__CUDA_ARCH__")
+#endif
+
 #undef __MATH_FUNCTIONS_HPP__
 
 // math_functions.hpp defines ::signbit as a __host__ __device__ function.  This
@@ -217,7 +304,27 @@
 #pragma push_macro("__GNUC__")
 #undef __GNUC__
 #define signbit __ignored_cuda_signbit
+
+// CUDA-9 omits device-side definitions of some math functions if it sees
+// include guard from math.h wrapper from libstdc++. We have to undo the header
+// guard temporarily to get the definitions we need.
+#pragma push_macro("_GLIBCXX_MATH_H")
+#pragma push_macro("_LIBCPP_VERSION")
+#if CUDA_VERSION >= 9000
+#undef _GLIBCXX_MATH_H
+// We also need to undo another guard that checks for libc++ 3.8+
+#ifdef _LIBCPP_VERSION
+#define _LIBCPP_VERSION 3700
+#endif
+#endif
+
+#if CUDA_VERSION >= 9000
+#include "crt/math_functions.hpp"
+#else
 #include "math_functions.hpp"
+#endif
+#pragma pop_macro("_GLIBCXX_MATH_H")
+#pragma pop_macro("_LIBCPP_VERSION")
 #pragma pop_macro("__GNUC__")
 #pragma pop_macro("signbit")
 
@@ -267,8 +374,8 @@
 }
 } // namespace std
 
-// Out-of-line implementations from cuda_builtin_vars.h.  These need to come
-// after we've pulled in the definition of uint3 and dim3.
+// Out-of-line implementations from __clang_cuda_builtin_vars.h.  These need to
+// come after we've pulled in the definition of uint3 and dim3.
 
 __device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
   uint3 ret;
@@ -296,13 +403,14 @@
 
 #include <__clang_cuda_cmath.h>
 #include <__clang_cuda_intrinsics.h>
+#include <__clang_cuda_complex_builtins.h>
 
 // curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
 // mode, giving them their "proper" types of dim3 and uint3.  This is
-// incompatible with the types we give in cuda_builtin_vars.h.  As as hack,
-// force-include the header (nvcc doesn't include it by default) but redefine
-// dim3 and uint3 to our builtin types.  (Thankfully dim3 and uint3 are only
-// used here for the redeclarations of blockDim and threadIdx.)
+// incompatible with the types we give in __clang_cuda_builtin_vars.h.  As a
+// hack, force-include the header (nvcc doesn't include it by default) but
+// redefine dim3 and uint3 to our builtin types.  (Thankfully dim3 and uint3 are
+// only used here for the redeclarations of blockDim and threadIdx.)
 #pragma push_macro("dim3")
 #pragma push_macro("uint3")
 #define dim3 __cuda_builtin_blockDim_t
diff --git a/darwin-x86/clang-headers/__wmmintrin_aes.h b/darwin-x86/clang-headers/__wmmintrin_aes.h
index 211518e..70c355e 100644
--- a/darwin-x86/clang-headers/__wmmintrin_aes.h
+++ b/darwin-x86/clang-headers/__wmmintrin_aes.h
@@ -20,22 +20,25 @@
  *
  *===-----------------------------------------------------------------------===
  */
-#ifndef _WMMINTRIN_AES_H
-#define _WMMINTRIN_AES_H
 
-#include <emmintrin.h>
+#ifndef __WMMINTRIN_H
+#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
+#endif
+
+#ifndef __WMMINTRIN_AES_H
+#define __WMMINTRIN_AES_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
 
-/// \brief Performs a single round of AES encryption using the Equivalent
+/// Performs a single round of AES encryption using the Equivalent
 ///    Inverse Cipher, transforming the state value from the first source
 ///    operand using a 128-bit round key value contained in the second source
 ///    operand, and writes the result to the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VAESENC instruction.
+/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
 ///
 /// \param __V
 ///    A 128-bit integer vector containing the state value.
@@ -48,14 +51,14 @@
   return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
 }
 
-/// \brief Performs the final round of AES encryption using the Equivalent
+/// Performs the final round of AES encryption using the Equivalent
 ///    Inverse Cipher, transforming the state value from the first source
 ///    operand using a 128-bit round key value contained in the second source
 ///    operand, and writes the result to the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VAESENCLAST instruction.
+/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
 ///
 /// \param __V
 ///    A 128-bit integer vector containing the state value.
@@ -68,14 +71,14 @@
   return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
 }
 
-/// \brief Performs a single round of AES decryption using the Equivalent
+/// Performs a single round of AES decryption using the Equivalent
 ///    Inverse Cipher, transforming the state value from the first source
 ///    operand using a 128-bit round key value contained in the second source
 ///    operand, and writes the result to the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VAESDEC instruction.
+/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
 ///
 /// \param __V
 ///    A 128-bit integer vector containing the state value.
@@ -88,14 +91,14 @@
   return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
 }
 
-/// \brief Performs the final round of AES decryption using the Equivalent
+/// Performs the final round of AES decryption using the Equivalent
 ///    Inverse Cipher, transforming the state value from the first source
 ///    operand using a 128-bit round key value contained in the second source
 ///    operand, and writes the result to the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VAESDECLAST instruction.
+/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
 ///
 /// \param __V
 ///    A 128-bit integer vector containing the state value.
@@ -108,13 +111,13 @@
   return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
 }
 
-/// \brief Applies the AES InvMixColumns() transformation to an expanded key
+/// Applies the AES InvMixColumns() transformation to an expanded key
 ///    contained in the source operand, and writes the result to the
 ///    destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VAESIMC instruction.
+/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
 ///
 /// \param __V
 ///    A 128-bit integer vector containing the expanded key.
@@ -125,7 +128,7 @@
   return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
 }
 
-/// \brief Generates a round key for AES encyption, operating on 128-bit data
+/// Generates a round key for AES encryption, operating on 128-bit data
 ///    specified in the first source operand and using an 8-bit round constant
 ///    specified by the second source operand, and writes the result to the
 ///    destination.
@@ -136,7 +139,7 @@
 /// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c AESKEYGENASSIST instruction.
+/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
 ///
 /// \param C
 ///    A 128-bit integer vector that is used to generate the AES encryption key.
@@ -148,4 +151,4 @@
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif  /* _WMMINTRIN_AES_H */
+#endif  /* __WMMINTRIN_AES_H */
diff --git a/darwin-x86/clang-headers/__wmmintrin_pclmul.h b/darwin-x86/clang-headers/__wmmintrin_pclmul.h
index d4e073f..e0f9287 100644
--- a/darwin-x86/clang-headers/__wmmintrin_pclmul.h
+++ b/darwin-x86/clang-headers/__wmmintrin_pclmul.h
@@ -20,10 +20,15 @@
  *
  *===-----------------------------------------------------------------------===
  */
-#ifndef _WMMINTRIN_PCLMUL_H
-#define _WMMINTRIN_PCLMUL_H
 
-/// \brief Multiplies two 64-bit integer values, which are selected from source
+#ifndef __WMMINTRIN_H
+#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
+#endif
+
+#ifndef __WMMINTRIN_PCLMUL_H
+#define __WMMINTRIN_PCLMUL_H
+
+/// Multiplies two 64-bit integer values, which are selected from source
 ///    operands using the immediate-value operand. The multiplication is a
 ///    carry-less multiplication, and the 128-bit integer product is stored in
 ///    the destination.
@@ -34,7 +39,7 @@
 /// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPCLMULQDQ instruction.
+/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
 ///
 /// \param __X
 ///    A 128-bit vector of [2 x i64] containing one of the source operands.
@@ -42,17 +47,16 @@
 ///    A 128-bit vector of [2 x i64] containing one of the source operands.
 /// \param __I
 ///    An immediate value specifying which 64-bit values to select from the
-///    operands.
-///    Bit 0 is used to select a value from operand __X,
-///    and bit 4 is used to select a value from operand __Y:
-///    Bit[0]=0 indicates that bits[63:0] of operand __X are used.
-///    Bit[0]=1 indicates that bits[127:64] of operand __X are used.
-///    Bit[4]=0 indicates that bits[63:0] of operand __Y are used.
-///    Bit[4]=1 indicates that bits[127:64] of operand __Y are used.
+///    operands. Bit 0 is used to select a value from operand \a __X, and bit
+///    4 is used to select a value from operand \a __Y: \n
+///    Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
+///    Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
+///    Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
+///    Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
 /// \returns The 128-bit integer vector containing the result of the carry-less
 ///    multiplication of the selected 64-bit values.
-#define _mm_clmulepi64_si128(__X, __Y, __I) \
-  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
-                                        (__v2di)(__m128i)(__Y), (char)(__I)))
+#define _mm_clmulepi64_si128(X, Y, I) \
+  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
+                                        (__v2di)(__m128i)(Y), (char)(I)))
 
-#endif /* _WMMINTRIN_PCLMUL_H */
+#endif /* __WMMINTRIN_PCLMUL_H */
diff --git a/darwin-x86/clang-headers/altivec.h b/darwin-x86/clang-headers/altivec.h
index 74a1914..90fd477 100644
--- a/darwin-x86/clang-headers/altivec.h
+++ b/darwin-x86/clang-headers/altivec.h
@@ -34,8 +34,31 @@
 #define __CR6_LT 2
 #define __CR6_LT_REV 3
 
+/* Constants for vec_test_data_class */
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
+#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \
+                                  __VEC_CLASS_FP_SUBNORMAL_N)
+#define __VEC_CLASS_FP_ZERO_N (1<<2)
+#define __VEC_CLASS_FP_ZERO_P (1<<3)
+#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P           | \
+                             __VEC_CLASS_FP_ZERO_N)
+#define __VEC_CLASS_FP_INFINITY_N (1<<4)
+#define __VEC_CLASS_FP_INFINITY_P (1<<5)
+#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P   | \
+                                 __VEC_CLASS_FP_INFINITY_N)
+#define __VEC_CLASS_FP_NAN (1<<6)
+#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN        | \
+                                   __VEC_CLASS_FP_SUBNORMAL  | \
+                                   __VEC_CLASS_FP_ZERO       | \
+                                   __VEC_CLASS_FP_INFINITY)
+
 #define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
 
+#ifdef __POWER9_VECTOR__
+#include <stddef.h>
+#endif
+
 static __inline__ vector signed char __ATTRS_o_ai vec_perm(
     vector signed char __a, vector signed char __b, vector unsigned char __c);
 
@@ -134,7 +157,7 @@
 #endif
 }
 
-#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+#ifdef __VSX__
 static __inline__ vector double __ATTRS_o_ai vec_abs(vector double __a) {
   return __builtin_vsx_xvabsdp(__a);
 }
@@ -163,6 +186,26 @@
       __a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
 }
 
+/* vec_absd */
+#if defined(__POWER9_VECTOR__)
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_absd(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vabsdub(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_absd(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vabsduh(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_absd(vector unsigned int __a,  vector unsigned int __b) {
+  return __builtin_altivec_vabsduw(__a, __b);
+}
+
+#endif /* End __POWER9_VECTOR__ */
+
 /* vec_add */
 
 static __inline__ vector signed char __ATTRS_o_ai
@@ -305,6 +348,22 @@
 }
 #endif
 
+static __inline__ vector signed int __ATTRS_o_ai
+vec_adde(vector signed int __a, vector signed int __b,
+         vector signed int __c) {
+  vector signed int __mask = {1, 1, 1, 1};
+  vector signed int __carry = __c & __mask;
+  return vec_add(vec_add(__a, __b), __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adde(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  vector unsigned int __mask = {1, 1, 1, 1};
+  vector unsigned int __carry = __c & __mask;
+  return vec_add(vec_add(__a, __b), __carry);
+}
+
 /* vec_addec */
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
@@ -319,6 +378,50 @@
           vector unsigned __int128 __c) {
   return __builtin_altivec_vaddecuq(__a, __b, __c);
 }
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_addec(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+
+  signed int __result[4];
+  for (int i = 0; i < 4; i++) {
+    unsigned int __tempa = (unsigned int) __a[i];
+    unsigned int __tempb = (unsigned int) __b[i];
+    unsigned int __tempc = (unsigned int) __c[i];
+    __tempc = __tempc & 0x00000001;
+    unsigned long long __longa = (unsigned long long) __tempa;
+    unsigned long long __longb = (unsigned long long) __tempb;
+    unsigned long long __longc = (unsigned long long) __tempc;
+    unsigned long long __sum = __longa + __longb + __longc;
+    unsigned long long __res = (__sum >> 32) & 0x01;
+    unsigned long long __tempres = (unsigned int) __res;
+    __result[i] = (signed int) __tempres;
+  }
+
+  vector signed int ret = { __result[0], __result[1], __result[2], __result[3] };
+  return ret;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_addec(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+
+  unsigned int __result[4];
+  for (int i = 0; i < 4; i++) {
+    unsigned int __tempc = __c[i] & 1;
+    unsigned long long __longa = (unsigned long long) __a[i];
+    unsigned long long __longb = (unsigned long long) __b[i];
+    unsigned long long __longc = (unsigned long long) __tempc;
+    unsigned long long __sum = __longa + __longb + __longc;
+    unsigned long long __res = (__sum >> 32) & 0x01;
+    unsigned long long __tempres = (unsigned int) __res;
+    __result[i] = (signed int) __tempres;
+  }
+
+  vector unsigned int ret = { __result[0], __result[1], __result[2], __result[3] };
+  return ret;
+}
+
 #endif
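
Taken together, vec_adde and vec_addec implement a 32-bit-lane add with carry:
the former yields the low word of a + b + (c & 1) per lane, the latter the
corresponding carry-out bit. A hedged sketch of chaining them (the helper name
is illustrative, and vec_addec needs the POWER8/64-bit configuration guarded
above):

static inline void __add32_with_carry_sketch(
    vector unsigned int __a, vector unsigned int __b, vector unsigned int __cin,
    vector unsigned int *__sum, vector unsigned int *__cout) {
  *__sum = vec_adde(__a, __b, __cin);   // per lane: a + b + (cin & 1), mod 2^32
  *__cout = vec_addec(__a, __b, __cin); // 1 where that addition wrapped, else 0
}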
 
 /* vec_vaddubm */
@@ -1544,6 +1647,12 @@
                                                       (vector char)__b);
 }
 
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
 static __inline__ vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a,
                                                            vector short __b) {
   return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
@@ -1555,6 +1664,12 @@
                                                        (vector short)__b);
 }
 
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
+                                                       (vector short)__b);
+}
+
 static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a,
                                                          vector int __b) {
   return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
@@ -1566,6 +1681,12 @@
                                                      (vector int)__b);
 }
 
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector bool int __a,
+                                                         vector bool int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
+                                                     (vector int)__b);
+}
+
 #ifdef __POWER8_VECTOR__
 static __inline__ vector bool long long __ATTRS_o_ai
 vec_cmpeq(vector signed long long __a, vector signed long long __b) {
@@ -1577,6 +1698,13 @@
   return (vector bool long long)__builtin_altivec_vcmpequd(
       (vector long long)__a, (vector long long)__b);
 }
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(
+      (vector long long)__a, (vector long long)__b);
+}
+
 #endif
 
 static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
@@ -1595,6 +1723,199 @@
 }
 #endif
 
+#ifdef __POWER9_VECTOR__
+/* vec_cmpne */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector bool char __a, vector bool char __b) {
+  return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+                                                     (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+                                                     (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+                                                     (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector bool short __a, vector bool short __b) {
+  return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+                                                      (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector signed short __a, vector signed short __b) {
+  return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+                                                      (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+                                                      (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector bool int __a, vector bool int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector signed int __a, vector signed int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
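+// There is no 64-bit vcmpne instruction; synthesize the comparison as the
+// complement of the 64-bit equality compare.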
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector double __a, vector double __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+/* vec_cmpnez */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector signed short __a, vector signed short __b) {
+  return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector signed int __a, vector signed int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+                                                     (vector int)__b);
+}
+
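+/* vec_cntlz_lsbb: count leading bytes whose least-significant bit is zero.
+   Element 0 sits at opposite ends of the register on the two endiannesses,
+   so little-endian uses the trailing-count builtin (vctzlsbb) and
+   big-endian the leading-count builtin (vclzlsbb). */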
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vctzlsbb(__a);
+#else
+  return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vctzlsbb(__a);
+#else
+  return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclzlsbb(__a);
+#else
+  return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclzlsbb(__a);
+#else
+  return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
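+/* vec_parity_lsbb: for each element, the parity (XOR) of the
+   least-significant bits of its bytes (vprtybw/vprtybd/vprtybq). */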
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned int __a) {
+  return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector signed int __a) {
+  return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned __int128 __a) {
+  return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector signed __int128 __a) {
+  return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned long long __a) {
+  return __builtin_altivec_vprtybd(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector signed long long __a) {
+  return __builtin_altivec_vprtybd(__a);
+}
+
+#endif
+
 /* vec_cmpgt */
 
 static __inline__ vector bool char __ATTRS_o_ai
@@ -1882,6 +2203,41 @@
   return vec_cmpgt(__b, __a);
 }
 
+/* vec_popcnt */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_popcnt(vector signed char __a) {
+  return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_popcnt(vector unsigned char __a) {
+  return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_popcnt(vector signed short __a) {
+  return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_popcnt(vector unsigned short __a) {
+  return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_popcnt(vector signed int __a) {
+  return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_popcnt(vector unsigned int __a) {
+  return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_popcnt(vector signed long long __a) {
+  return __builtin_altivec_vpopcntd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_popcnt(vector unsigned long long __a) {
+  return __builtin_altivec_vpopcntd(__a);
+}
+
 /* vec_cntlz */
 
 static __inline__ vector signed char __ATTRS_o_ai
@@ -1918,6 +2274,603 @@
 }
 #endif
 
+#ifdef __POWER9_VECTOR__
+
+/* vec_cnttz */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_cnttz(vector signed char __a) {
+  return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_cnttz(vector unsigned char __a) {
+  return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_cnttz(vector signed short __a) {
+  return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_cnttz(vector unsigned short __a) {
+  return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_cnttz(vector signed int __a) {
+  return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_cnttz(vector unsigned int __a) {
+  return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cnttz(vector signed long long __a) {
+  return __builtin_altivec_vctzd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cnttz(vector unsigned long long __a) {
+  return __builtin_altivec_vctzd(__a);
+}
+
+/* vec_first_match_index */
+
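+/* The comparison result is scanned as two 64-bit halves: count the zero
+   bits before the first set bit, starting from the element-0 end, and fall
+   through to the second half when the first is all zeros (count == 64).
+   The bit index is divided by the element width in bits (>> 3 for bytes,
+   >> 4 for halfwords, >> 5 for words); a complete miss yields the number
+   of elements in the vector. */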
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed char __a, vector signed char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned char __a, vector unsigned char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed short __a, vector signed short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed int __a, vector signed int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+/* vec_first_match_or_eos_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed char __a, vector signed char __b) {
+  /* OR the equality comparison of the two inputs with comparisons of each
+     input against that result: where __a != __b the lanes of __tmp1 are
+     all zeros, so comparing __tmp1 with __a (or __b) sets the lanes in
+     which that input element is zero.  A lane is therefore set on a match
+     or at an end-of-string (zero) element in either input. */
+  vector bool char __tmp1 = vec_cmpeq(__a, __b);
+  vector bool char __tmp2 = __tmp1 |
+                            vec_cmpeq((vector signed char)__tmp1, __a) |
+                            vec_cmpeq((vector signed char)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned char __a,
+                             vector unsigned char __b) {
+  vector bool char __tmp1 = vec_cmpeq(__a, __b);
+  vector bool char __tmp2 = __tmp1 |
+                            vec_cmpeq((vector unsigned char)__tmp1, __a) |
+                            vec_cmpeq((vector unsigned char)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed short __a, vector signed short __b) {
+  vector bool short __tmp1 = vec_cmpeq(__a, __b);
+  vector bool short __tmp2 = __tmp1 |
+                             vec_cmpeq((vector signed short)__tmp1, __a) |
+                             vec_cmpeq((vector signed short)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned short __a,
+                             vector unsigned short __b) {
+  vector bool short __tmp1 = vec_cmpeq(__a, __b);
+  vector bool short __tmp2 = __tmp1 |
+                             vec_cmpeq((vector unsigned short)__tmp1, __a) |
+                             vec_cmpeq((vector unsigned short)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed int __a, vector signed int __b) {
+  vector bool int __tmp1 = vec_cmpeq(__a, __b);
+  vector bool int __tmp2 = __tmp1 | vec_cmpeq((vector signed int)__tmp1, __a) |
+                           vec_cmpeq((vector signed int)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned int __a, vector unsigned int __b) {
+  vector bool int __tmp1 = vec_cmpeq(__a, __b);
+  vector bool int __tmp2 = __tmp1 |
+                           vec_cmpeq((vector unsigned int)__tmp1, __a) |
+                           vec_cmpeq((vector unsigned int)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)__tmp2);
+#else
+    vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed char __a, vector signed char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned char __a, vector unsigned char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed short __a, vector signed short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed int __a, vector signed int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_or_eos_index */
+
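+/* Same scan as vec_first_mismatch_index, but vec_cmpnez also sets a lane
+   when either input element is zero, so the scan stops at an embedded
+   string terminator as well as at a mismatch. */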
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed char __a,
+                                vector signed char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned char __a,
+                                vector unsigned char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed short __a,
+                                vector signed short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned short __a,
+                                vector unsigned short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed int __a, vector signed int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned int __a,
+                                vector unsigned int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
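+/* vec_insert_exp: build each result element from the sign and significand
+   of __a and the biased exponent taken from the low-order bits of the
+   corresponding element of __b (xviexpdp/xviexpsp). */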
+static __inline__ vector double __ATTRS_o_ai
+vec_insert_exp(vector double __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp((vector unsigned long long)__a, __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_insert_exp(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_insert_exp(vector float __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_insert_exp(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp(__a, __b);
+}
+
+#if defined(__powerpc64__)
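+/* vec_xl_len: load the first __b bytes (at most 16) from __a into a vector.
+   lxvl expects the byte count in the most-significant byte of its length
+   operand, hence the (__b << 56) below. */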
+static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a,
+                                                             size_t __b) {
+  return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_len(unsigned char *__a, size_t __b) {
+  return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a,
+                                                              size_t __b) {
+  return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_len(unsigned short *__a, size_t __b) {
+  return (vector unsigned short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a,
+                                                            size_t __b) {
+  return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a,
+                                                              size_t __b) {
+  return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) {
+  return (vector float)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_len(signed __int128 *__a, size_t __b) {
+  return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_len(unsigned __int128 *__a, size_t __b) {
+  return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_len(signed long long *__a, size_t __b) {
+  return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_len(unsigned long long *__a, size_t __b) {
+  return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a,
+                                                        size_t __b) {
+  return (vector double)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xl_len_r(unsigned char *__a,
+                                                          size_t __b) {
+  vector unsigned char __res =
+      (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __mask =
+      (vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL);
+  __res = (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__res, (vector int)__res, __mask);
+#endif
+  return __res;
+}
+
+// vec_xst_len
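+// Store the first __c bytes of __a to *__b; as with lxvl, stxvl takes the
+// byte count in the most-significant byte of the length operand.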
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned char __a,
+                                                unsigned char *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed char __a,
+                                                signed char *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed short __a,
+                                                signed short *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned short __a,
+                                                unsigned short *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed int __a,
+                                                signed int *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned int __a,
+                                                unsigned int *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector float __a, float *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed __int128 __a,
+                                                signed __int128 *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned __int128 __a,
+                                                unsigned __int128 *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed long long __a,
+                                                signed long long *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned long long __a,
+                                                unsigned long long *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector double __a, double *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a,
+                                                  unsigned char *__b,
+                                                  size_t __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __mask =
+      (vector unsigned char)__builtin_altivec_lvsl(16 - __c, (int *)NULL);
+  vector unsigned char __res =
+      __builtin_altivec_vperm_4si((vector int)__a, (vector int)__a, __mask);
+  return __builtin_vsx_stxvll((vector int)__res, __b, (__c << 56));
+#else
+  return __builtin_vsx_stxvll((vector int)__a, __b, (__c << 56));
+#endif
+}
+#endif
+#endif
+
 /* vec_cpsgn */
 
 #ifdef __VSX__
@@ -1934,102 +2887,358 @@
 
 /* vec_ctf */
 
-static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
-  return __builtin_altivec_vcfsx(__a, __b);
-}
-
-static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a,
-                                                    int __b) {
-  return __builtin_altivec_vcfux((vector int)__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector double __ATTRS_o_ai
-vec_ctf(vector unsigned long long __a, int __b) {
-  vector double __ret = __builtin_convertvector(__a, vector double);
-  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
-  return __ret;
-}
-
-static __inline__ vector double __ATTRS_o_ai
-vec_ctf(vector signed long long __a, int __b) {
-  vector double __ret = __builtin_convertvector(__a, vector double);
-  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
-  return __ret;
-}
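+/* For the 64-bit variants the scaling is done with a multiply:
+   ((0x3ffULL - (__b)) << 52) is the IEEE-754 bit pattern of the double
+   2**-__b, so multiplying the converted value applies the requested number
+   of fraction bits. */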
+#define vec_ctf(__a, __b)                                                      \
+  _Generic((__a), vector int                                                   \
+           : (vector float)__builtin_altivec_vcfsx((__a), (__b)),              \
+             vector unsigned int                                               \
+           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)),  \
+             vector unsigned long long                                         \
+           : (__builtin_convertvector((vector unsigned long long)(__a),        \
+                                      vector double) *                         \
+              (vector double)(vector unsigned long long)((0x3ffULL - (__b))    \
+                                                         << 52)),              \
+             vector signed long long                                           \
+           : (__builtin_convertvector((vector signed long long)(__a),          \
+                                      vector double) *                         \
+              (vector double)(vector unsigned long long)((0x3ffULL - (__b))    \
+                                                         << 52)))
+#else
+#define vec_ctf(__a, __b)                                                      \
+  _Generic((__a), vector int                                                   \
+           : (vector float)__builtin_altivec_vcfsx((__a), (__b)),              \
+             vector unsigned int                                               \
+           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)))
 #endif
 
 /* vec_vcfsx */
 
-static __inline__ vector float __attribute__((__always_inline__))
-vec_vcfsx(vector int __a, int __b) {
-  return __builtin_altivec_vcfsx(__a, __b);
-}
+#define vec_vcfsx(__a, __b) __builtin_altivec_vcfsx((vector int)(__a), (__b))
 
 /* vec_vcfux */
 
-static __inline__ vector float __attribute__((__always_inline__))
-vec_vcfux(vector unsigned int __a, int __b) {
-  return __builtin_altivec_vcfux((vector int)__a, __b);
-}
+#define vec_vcfux __builtin_altivec_vcfux
 
 /* vec_cts */
 
-static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
-  return __builtin_altivec_vctsxs(__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector signed long long __ATTRS_o_ai
-vec_cts(vector double __a, int __b) {
-  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
-  return __builtin_convertvector(__a, vector signed long long);
-}
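+/* vec_cts/vec_ctu use the inverse trick: ((0x3ffULL + (__b)) << 52) is the
+   bit pattern of 2**__b, pre-scaling the double input before it is
+   truncated to a fixed-point integer. */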
+#define vec_cts(__a, __b)                                                      \
+  _Generic((__a), vector float                                                 \
+           : __builtin_altivec_vctsxs((__a), (__b)), vector double             \
+           : __extension__({                                                   \
+             vector double __ret =                                             \
+                 (__a) *                                                       \
+                 (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
+                                                            << 52);            \
+             __builtin_convertvector(__ret, vector signed long long);          \
+           }))
+#else
+#define vec_cts __builtin_altivec_vctsxs
 #endif
 
 /* vec_vctsxs */
 
-static __inline__ vector int __attribute__((__always_inline__))
-vec_vctsxs(vector float __a, int __b) {
-  return __builtin_altivec_vctsxs(__a, __b);
-}
+#define vec_vctsxs __builtin_altivec_vctsxs
 
 /* vec_ctu */
 
-static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a,
-                                                           int __b) {
-  return __builtin_altivec_vctuxs(__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector unsigned long long __ATTRS_o_ai
-vec_ctu(vector double __a, int __b) {
-  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
-  return __builtin_convertvector(__a, vector unsigned long long);
-}
+#define vec_ctu(__a, __b)                                                      \
+  _Generic((__a), vector float                                                 \
+           : __builtin_altivec_vctuxs((__a), (__b)), vector double             \
+           : __extension__({                                                   \
+             vector double __ret =                                             \
+                 (__a) *                                                       \
+                 (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
+                                                            << 52);            \
+             __builtin_convertvector(__ret, vector unsigned long long);        \
+           }))
+#else
+#define vec_ctu __builtin_altivec_vctuxs
 #endif
 
 /* vec_vctuxs */
 
-static __inline__ vector unsigned int __attribute__((__always_inline__))
-vec_vctuxs(vector float __a, int __b) {
-  return __builtin_altivec_vctuxs(__a, __b);
+#define vec_vctuxs __builtin_altivec_vctuxs
+
+/* vec_signed */
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sld(vector signed int, vector signed int, unsigned const int __c);
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signed(vector float __a) {
+  return __builtin_convertvector(__a, vector signed int);
 }
 
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_signed(vector double __a) {
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_signed2(vector double __a, vector double __b) {
+  return (vector signed int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
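+/* xvcvdpsxws writes its two results to the even word lanes of the
+   big-endian register layout, so the little-endian variants rotate the
+   result by 12 bytes with vec_sld to reach the requested lanes. */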
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvdpsxws(__a);
+#endif
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvdpsxws(__a);
+#else
+  vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_unsigned */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sld(vector unsigned int, vector unsigned int, unsigned const int __c);
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsigned(vector float __a) {
+  return __builtin_convertvector(__a, vector unsigned int);
+}
+
+#ifdef __VSX__
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_unsigned(vector double __a) {
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_unsigned2(vector double __a, vector double __b) {
+  return (vector unsigned int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvdpuxws(__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvdpuxws(__a);
+#else
+  vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_float */
+
+static __inline__ vector float __ATTRS_o_ai
+vec_sld(vector float, vector float, unsigned const int __c);
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector signed int __a) {
+  return __builtin_convertvector(__a, vector float);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector unsigned int __a) {
+  return __builtin_convertvector(__a, vector float);
+}
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector signed long long __a, vector signed long long __b) {
+  return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector double __a, vector double __b) {
+  return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvsxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvuxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvdpsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvsxdsp(__a);
+#else
+  vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvuxdsp(__a);
+#else
+  vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvdpsp(__a);
+#else
+  vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
 /* vec_double */
 
 #ifdef __VSX__
 static __inline__ vector double __ATTRS_o_ai
 vec_double(vector signed long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_double(vector unsigned long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
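+/* The word-to-double conversions read fixed word lanes of the register, so
+   the variant that does not match the native lane order first rotates its
+   input by one word with vec_sld. */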
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#else
+  return __builtin_vsx_xvcvsxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#else
+  return __builtin_vsx_xvcvuxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#else
+  return __builtin_vsx_xvcvspdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector signed int __a) {
   vector double __ret = {__a[0], __a[1]};
   return __ret;
 }
 
 static __inline__ vector double __ATTRS_o_ai
-vec_double(vector unsigned long long __a) {
+vec_doubleh(vector unsigned int __a) {
   vector double __ret = {__a[0], __a[1]};
   return __ret;
 }
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector float __a) {
+  vector double __ret = {__a[0], __a[1]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector signed int __a) {
+  vector double __ret = {__a[2], __a[3]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector unsigned int __a) {
+  vector double __ret = {__a[2], __a[3]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector float __a) {
+  vector double __ret = {__a[2], __a[3]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvsxwdp(__a);
+#else
+  return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvuxwdp(__a);
+#else
+  return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvspdp(__a);
+#else
+  return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#endif
+}
 #endif
 
 /* vec_div */
@@ -3835,6 +5044,34 @@
                                          0x18, 0x19, 0x1A, 0x1B));
 }
 
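+/* For two-element (doubleword) vectors the even merge degenerates to
+   vec_mergeh and the odd merge to vec_mergel. */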
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergee(vector bool long long __a, vector bool long long __b) {
+  return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergee(vector signed long long __a, vector signed long long __b) {
+  return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergee(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergee(vector float __a, vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergee(vector double __a, vector double __b) {
+  return vec_mergeh(__a, __b);
+}
+
 /* vec_mergeo */
 
 static __inline__ vector bool int __ATTRS_o_ai vec_mergeo(vector bool int __a,
@@ -3861,6 +5098,34 @@
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergeo(vector bool long long __a, vector bool long long __b) {
+  return vec_mergel(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeo(vector signed long long __a, vector signed long long __b) {
+  return vec_mergel(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeo(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_mergel(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergeo(vector float __a, vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeo(vector double __a, vector double __b) {
+  return vec_mergel(__a, __b);
+}
+
 #endif
 
 /* vec_mfvscr */
@@ -4689,6 +5954,12 @@
   return ~(__a & __b);
 }
 
+static __inline__ vector float __ATTRS_o_ai
+vec_nand(vector float __a, vector float __b) {
+  return (vector float)(~((vector unsigned int)__a &
+                          (vector unsigned int)__b));
+}
+
 static __inline__ vector signed long long __ATTRS_o_ai
 vec_nand(vector signed long long __a, vector signed long long __b) {
   return ~(__a & __b);
@@ -4724,6 +5995,12 @@
   return ~(__a & __b);
 }
 
+static __inline__ vector double __ATTRS_o_ai
+vec_nand(vector double __a, vector double __b) {
+  return (vector double)(~((vector unsigned long long)__a &
+                           (vector unsigned long long)__b));
+}
+
 #endif
 
 /* vec_nmadd */
@@ -5195,6 +6472,16 @@
   return __a | ~__b;
 }
 
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector bool int __a, vector float __b) {
+ return (vector float)(__a | ~(vector unsigned int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector float __a, vector bool int __b) {
+  return (vector float)((vector unsigned int)__a | ~__b);
+}
+
 static __inline__ vector signed long long __ATTRS_o_ai
 vec_orc(vector signed long long __a, vector signed long long __b) {
   return __a | ~__b;
@@ -5229,6 +6516,16 @@
 vec_orc(vector bool long long __a, vector bool long long __b) {
   return __a | ~__b;
 }
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a | ~__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector double __b) {
+  return (vector double)(__a | ~(vector unsigned long long)__b);
+}
 #endif
 
 /* vec_vor */
@@ -5536,8 +6833,25 @@
 #endif
 }
 
+static __inline__ vector float __ATTRS_o_ai
+vec_pack(vector double __a, vector double __b) {
+  return (vector float) (__a[0], __a[1], __b[0], __b[1]);
+}
 #endif
 
+#ifdef __POWER9_VECTOR__
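+/* vec_pack_to_short_fp32: convert eight floats to half precision.  xvcvsphp
+   produces a half-precision result within each word lane; the
+   endian-dependent choice of vec_mergee/vec_mergeo then packs the eight
+   16-bit results into a single vector. */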
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_pack_to_short_fp32(vector float __a, vector float __b) {
+  vector float __resa = __builtin_vsx_xvcvsphp(__a);
+  vector float __resb = __builtin_vsx_xvcvsphp(__b);
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_mergee(__resa, __resb);
+#else
+  return (vector unsigned short)vec_mergeo(__resa, __resb);
+#endif
+}
+#endif
+
 /* vec_vpkuhum */
 
 #define __builtin_altivec_vpkuhum vec_vpkuhum
@@ -6324,6 +7638,36 @@
 }
 #endif
 
+/* vec_rlmi */
+#ifdef __POWER9_VECTOR__
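+/* Note: the vrlwmi/vrldmi builtins take their last two operands in the
+   opposite order from vec_rlmi's parameters. */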
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlmi(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  return __builtin_altivec_vrlwmi(__a, __c, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlmi(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned long long __c) {
+  return __builtin_altivec_vrldmi(__a, __c, __b);
+}
+
+/* vec_rlnm */
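+/* vrlwnm/vrldnm take the rotate count and the mask bounds packed into a
+   single operand, so __c is shifted up one byte and merged with __b. */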
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlnm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  vector unsigned int OneByte = { 0x8, 0x8, 0x8, 0x8 };
+  return __builtin_altivec_vrlwnm(__a, ((__c << OneByte) | __b));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlnm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned long long __c) {
+  vector unsigned long long OneByte = { 0x8, 0x8 };
+  return __builtin_altivec_vrldnm(__a, ((__c << OneByte) | __b));
+}
+#endif
+
 /* vec_vrlb */
 
 static __inline__ vector signed char __ATTRS_o_ai
@@ -6693,45 +8037,51 @@
 
 /* vec_sl */
 
-static __inline__ vector signed char __ATTRS_o_ai
-vec_sl(vector signed char __a, vector unsigned char __b) {
-  return __a << (vector signed char)__b;
-}
-
+// vec_sl performs modulo arithmetic on __b first (each shift count is
+// reduced modulo the element width in bits), so elements of __b may be
+// larger than the element size of __a.
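+// For example, a vector unsigned char shift count of 9 behaves as a shift
+// by 9 % 8 == 1.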
 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_sl(vector unsigned char __a, vector unsigned char __b) {
-  return __a << __b;
+  return __a << (__b %
+                 (vector unsigned char)(sizeof(unsigned char) * __CHAR_BIT__));
 }
 
-static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
-                                                   vector unsigned short __b) {
-  return __a << (vector short)__b;
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)vec_sl((vector unsigned char)__a, __b);
 }
 
 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_sl(vector unsigned short __a, vector unsigned short __b) {
-  return __a << __b;
+  return __a << (__b % (vector unsigned short)(sizeof(unsigned short) *
+                                               __CHAR_BIT__));
 }
 
-static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
-                                                 vector unsigned int __b) {
-  return __a << (vector int)__b;
+static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
+                                                   vector unsigned short __b) {
+  return (vector short)vec_sl((vector unsigned short)__a, __b);
 }
 
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_sl(vector unsigned int __a, vector unsigned int __b) {
-  return __a << __b;
+  return __a << (__b %
+                 (vector unsigned int)(sizeof(unsigned int) * __CHAR_BIT__));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
+                                                 vector unsigned int __b) {
+  return (vector int)vec_sl((vector unsigned int)__a, __b);
 }
 
 #ifdef __POWER8_VECTOR__
-static __inline__ vector signed long long __ATTRS_o_ai
-vec_sl(vector signed long long __a, vector unsigned long long __b) {
-  return __a << (vector long long)__b;
-}
-
 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
-  return __a << __b;
+  return __a << (__b % (vector unsigned long long)(sizeof(unsigned long long) *
+                                                   __CHAR_BIT__));
+}
+
+static __inline__ vector long long __ATTRS_o_ai
+vec_sl(vector long long __a, vector unsigned long long __b) {
+  return (vector long long)vec_sl((vector unsigned long long)__a, __b);
 }
 #endif
 
@@ -6984,6 +8334,145 @@
 #endif
 }
 
+#ifdef __VSX__
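+/* vec_sld selects 16 consecutive bytes, starting at byte __c (mod 16, hence
+   the & 0x0F), from the element-order concatenation of __a and __b; on
+   little-endian the same selection is expressed as a vec_perm of (__b, __a)
+   with mirrored indices. */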
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_sld(vector bool long long __a, vector bool long long __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sld(vector signed long long __a, vector signed long long __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sld(vector unsigned long long __a, vector unsigned long long __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_sld(vector double __a,
+                                                     vector double __b,
+                                                     unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+#endif
+
+/* vec_sldw */
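+/* vec_sldw shifts by __c words rather than bytes: the count is scaled by 4
+   (__c << 2) and masked to stay within the 16-byte vector. */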
+static __inline__ vector signed char __ATTRS_o_ai vec_sldw(
+    vector signed char __a, vector signed char __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sldw(vector unsigned char __a, vector unsigned char __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_sldw(
+    vector signed short __a, vector signed short __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sldw(vector unsigned short __a, vector unsigned short __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sldw(vector signed int __a, vector signed int __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sldw(
+    vector unsigned int __a, vector unsigned int __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sldw(vector signed long long __a, vector signed long long __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+#endif
+
+#ifdef __POWER9_VECTOR__
+/* vec_slv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slv(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vslv(__a, __b);
+}
+
+/* vec_srv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srv(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsrv(__a, __b);
+}
+#endif
+
 /* vec_vsldoi */
 
 static __inline__ vector signed char __ATTRS_o_ai
@@ -7307,6 +8796,20 @@
                                                 (vector int)__b);
 }
 
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vsl((vector int)__a,
+                                                        (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsl((vector int)__a,
+                                                          (vector int)__b);
+}
+#endif
+
 /* vec_vsl */
 
 static __inline__ vector signed char __ATTRS_o_ai
@@ -7570,6 +9073,32 @@
   return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
 }
 
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector signed char __b) {
+  return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector signed char __b) {
+  return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+                                                           (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+                                                           (vector int)__b);
+}
+#endif
+
 /* vec_vslo */
 
 static __inline__ vector signed char __ATTRS_o_ai
@@ -8304,6 +9833,20 @@
                                                 (vector int)__b);
 }
 
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vsr((vector int)__a,
+                                                        (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsr((vector int)__a,
+                                                          (vector int)__b);
+}
+#endif
+
 /* vec_vsr */
 
 static __inline__ vector signed char __ATTRS_o_ai
@@ -8567,6 +10110,32 @@
   return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
 }
 
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector signed char __b) {
+  return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector signed char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+                                                           (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+                                                           (vector int)__b);
+}
+#endif
+
 /* vec_vsro */
 
 static __inline__ vector signed char __ATTRS_o_ai
@@ -9580,6 +11149,12 @@
 
 /* vec_subc */
 
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subc(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_altivec_vsubcuw((vector unsigned int)__a,
+                                                      (vector unsigned int)__b);
+}
+
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_subc(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_altivec_vsubcuw(__a, __b);
@@ -9813,6 +11388,7 @@
 
 /* vec_vsubeuqm */
 
 static __inline__ vector signed __int128 __ATTRS_o_ai
 vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b,
              vector signed __int128 __c) {
@@ -9825,6 +11401,18 @@
   return __builtin_altivec_vsubeuqm(__a, __b, __c);
 }
 
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_sube(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_sube(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
 /* vec_vsubcuq */
 
 static __inline__ vector signed __int128 __ATTRS_o_ai
@@ -9850,8 +11438,47 @@
              vector unsigned __int128 __c) {
   return __builtin_altivec_vsubecuq(__a, __b, __c);
 }
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subec(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subec(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_subec(vector signed __int128 __a, vector signed __int128 __b,
+          vector signed __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_subec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+          vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
 #endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
 
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sube(vector signed int __a, vector signed int __b,
+         vector signed int __c) {
+  vector signed int __mask = {1, 1, 1, 1};
+  vector signed int __carry = __c & __mask;
+  return vec_adde(__a, ~__b, __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sube(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  vector unsigned int __mask = {1, 1, 1, 1};
+  vector unsigned int __carry = __c & __mask;
+  return vec_adde(__a, ~__b, __carry);
+}
+
 /* vec_sum4s */
 
 static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a,
@@ -10051,6 +11678,11 @@
   return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
 #endif
 }
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackh(vector float __a) {
+  return (vector double)(__a[0], __a[1]);
+}
 #endif
 
 /* vec_vupkhsb */
@@ -10185,6 +11817,11 @@
   return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
 #endif
 }
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackl(vector float __a) {
+  return (vector double)(__a[2], __a[3]);
+}
 #endif
 
 /* vec_vupklsb */
@@ -10511,6 +12148,11 @@
 
 #endif
 
+#ifdef __VSX__
+#define vec_xxpermdi __builtin_vsx_xxpermdi
+#define vec_xxsldwi __builtin_vsx_xxsldwi
+#endif
+
 /* vec_xor */
 
 #define __builtin_altivec_vxor vec_xor
@@ -10935,6 +12577,58 @@
   return __a[__b];
 }
 
+#ifdef __POWER9_VECTOR__
+
+#define vec_insert4b __builtin_vsx_insertword
+#define vec_extract4b __builtin_vsx_extractuword
+
+/* vec_extract_exp */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_exp(vector float __a) {
+  return __builtin_vsx_xvxexpsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_exp(vector double __a) {
+  return __builtin_vsx_xvxexpdp(__a);
+}
+
+/* vec_extract_sig */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_sig(vector float __a) {
+  return __builtin_vsx_xvxsigsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_sig(vector double __a) {
+  return __builtin_vsx_xvxsigdp(__a);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shorth(vector unsigned short __a) {
+  vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+            __builtin_shufflevector(__a, __a, 0, -1, 1, -1, 2, -1, 3, -1);
+#else
+            __builtin_shufflevector(__a, __a, -1, 0, -1, 1, -1, 2, -1, 3);
+#endif
+  return __builtin_vsx_xvcvhpsp(__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shortl(vector unsigned short __a) {
+  vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+            __builtin_shufflevector(__a, __a, 4, -1, 5, -1, 6, -1, 7, -1);
+#else
+            __builtin_shufflevector(__a, __a, -1, 4, -1, 5, -1, 6, -1, 7);
+#endif
+  return __builtin_vsx_xvcvhpsp(__b);
+}
+#endif /* __POWER9_VECTOR__ */
+
 /* vec_insert */
 
 static __inline__ vector signed char __ATTRS_o_ai
@@ -14369,6 +16063,24 @@
 #endif
 
 #ifdef __POWER8_VECTOR__
+static __inline__ vector bool char __ATTRS_o_ai
+vec_permxor(vector bool char __a, vector bool char __b,
+            vector bool char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_permxor(vector signed char __a, vector signed char __b,
+            vector signed char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_permxor(vector unsigned char __a, vector unsigned char __b,
+            vector unsigned char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
 static __inline__ vector unsigned char __ATTRS_o_ai
 __builtin_crypto_vpermxor(vector unsigned char __a, vector unsigned char __b,
                           vector unsigned char __c) {
@@ -14453,6 +16165,572 @@
 #endif
 #endif
 
+/* vec_reve */
+
+static inline __ATTRS_o_ai vector bool char vec_reve(vector bool char __a) {
+  return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                                 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed char vec_reve(vector signed char __a) {
+  return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                                 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_reve(vector unsigned char __a) {
+  return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                                 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool int vec_reve(vector bool int __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int vec_reve(vector signed int __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_reve(vector unsigned int __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool short vec_reve(vector bool short __a) {
+  return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_reve(vector signed short __a) {
+  return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_reve(vector unsigned short __a) {
+  return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector float vec_reve(vector float __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector bool long long
+vec_reve(vector bool long long __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_reve(vector signed long long __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_reve(vector unsigned long long __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector double vec_reve(vector double __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+#endif
+
+/* vec_revb */
+static __inline__ vector bool char __ATTRS_o_ai
+vec_revb(vector bool char __a) {
+  return __a;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_revb(vector signed char __a) {
+  return __a;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_revb(vector unsigned char __a) {
+  return __a;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_revb(vector bool short __a) {
+  vector unsigned char __indices =
+      { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_revb(vector signed short __a) {
+  vector unsigned char __indices =
+      { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_revb(vector unsigned short __a) {
+  vector unsigned char __indices =
+      { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_revb(vector bool int __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_revb(vector signed int __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_revb(vector unsigned int __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_revb(vector float __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_revb(vector bool long long __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_revb(vector signed long long __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_revb(vector unsigned long long __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_revb(vector double __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+#endif /* End __VSX__ */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_revb(vector signed __int128 __a) {
+  vector unsigned char __indices =
+      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  return (vector signed __int128)vec_perm((vector signed int)__a,
+                                          (vector signed int)__a,
+                                          __indices);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_revb(vector unsigned __int128 __a) {
+  vector unsigned char __indices =
+      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  return (vector unsigned __int128)vec_perm((vector signed int)__a,
+                                            (vector signed int)__a,
+                                            __indices);
+}
+#endif /* END __POWER8_VECTOR__ && __powerpc64__ */
+
+/* vec_xl */
+
+static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset,
+                                                     signed char *__ptr) {
+  return *(vector signed char *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xl(signed long long __offset, unsigned char *__ptr) {
+  return *(vector unsigned char *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset,
+                                                      signed short *__ptr) {
+  return *(vector signed short *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xl(signed long long __offset, unsigned short *__ptr) {
+  return *(vector unsigned short *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset,
+                                                    signed int *__ptr) {
+  return *(vector signed int *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset,
+                                                      unsigned int *__ptr) {
+  return *(vector unsigned int *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset,
+                                               float *__ptr) {
+  return *(vector float *)(__ptr + __offset);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector signed long long
+vec_xl(signed long long __offset, signed long long *__ptr) {
+  return *(vector signed long long *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xl(signed long long __offset, unsigned long long *__ptr) {
+  return *(vector unsigned long long *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset,
+                                                double *__ptr) {
+  return *(vector double *)(__ptr + __offset);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai vector signed __int128
+vec_xl(signed long long __offset, signed __int128 *__ptr) {
+  return *(vector signed __int128 *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned __int128
+vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
+  return *(vector unsigned __int128 *)(__ptr + __offset);
+}
+#endif
+
+/* vec_xl_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed char *__ptr) {
+  vector signed char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                                 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned char *__ptr) {
+  vector unsigned char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                                 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed short *__ptr) {
+  vector signed short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned short *__ptr) {
+  vector unsigned short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, signed int *__ptr) {
+  return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, unsigned int *__ptr) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, float *__ptr) {
+  return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, signed long long *__ptr) {
+  return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, unsigned long long *__ptr) {
+  return (vector unsigned long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, double *__ptr) {
+  return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, signed __int128 *__ptr) {
+  return vec_xl(__offset, __ptr);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, unsigned __int128 *__ptr) {
+  return vec_xl(__offset, __ptr);
+}
+#endif
+#else
+  #define vec_xl_be vec_xl
+#endif
+
+/* vec_xst */
+
+static inline __ATTRS_o_ai void vec_xst(vector signed char __vec,
+                                        signed long long __offset,
+                                        signed char *__ptr) {
+  *(vector signed char *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned char __vec,
+                                        signed long long __offset,
+                                        unsigned char *__ptr) {
+  *(vector unsigned char *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector signed short __vec,
+                                        signed long long __offset,
+                                        signed short *__ptr) {
+  *(vector signed short *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned short __vec,
+                                        signed long long __offset,
+                                        unsigned short *__ptr) {
+  *(vector unsigned short *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector signed int __vec,
+                                        signed long long __offset,
+                                        signed int *__ptr) {
+  *(vector signed int *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned int __vec,
+                                        signed long long __offset,
+                                        unsigned int *__ptr) {
+  *(vector unsigned int *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector float __vec,
+                                        signed long long __offset,
+                                        float *__ptr) {
+  *(vector float *)(__ptr + __offset) = __vec;
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai void vec_xst(vector signed long long __vec,
+                                        signed long long __offset,
+                                        signed long long *__ptr) {
+  *(vector signed long long *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned long long __vec,
+                                        signed long long __offset,
+                                        unsigned long long *__ptr) {
+  *(vector unsigned long long *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector double __vec,
+                                        signed long long __offset,
+                                        double *__ptr) {
+  *(vector double *)(__ptr + __offset) = __vec;
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai void vec_xst(vector signed __int128 __vec,
+                                        signed long long __offset,
+                                        signed __int128 *__ptr) {
+  *(vector signed __int128 *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec,
+                                        signed long long __offset,
+                                        unsigned __int128 *__ptr) {
+  *(vector unsigned __int128 *)(__ptr + __offset) = __vec;
+}
+#endif
+
+/* vec_xst_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed char __vec,
+                                               signed long long  __offset,
+                                               signed char *__ptr) {
+  vector signed char __tmp =
+     __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                             13, 12, 11, 10, 9, 8);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec,
+                                               signed long long  __offset,
+                                               unsigned char *__ptr) {
+  vector unsigned char __tmp =
+     __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                             13, 12, 11, 10, 9, 8);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec,
+                                               signed long long  __offset,
+                                               signed short *__ptr) {
+  vector signed short __tmp =
+     __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec,
+                                               signed long long  __offset,
+                                               unsigned short *__ptr) {
+  vector unsigned short __tmp =
+     __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec,
+                                               signed long long  __offset,
+                                               signed int *__ptr) {
+  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned int __vec,
+                                               signed long long  __offset,
+                                               unsigned int *__ptr) {
+  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector float __vec,
+                                               signed long long  __offset,
+                                               float *__ptr) {
+  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed long long __vec,
+                                               signed long long  __offset,
+                                               signed long long *__ptr) {
+  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned long long __vec,
+                                               signed long long  __offset,
+                                               unsigned long long *__ptr) {
+  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector double __vec,
+                                               signed long long  __offset,
+                                               double *__ptr) {
+  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed __int128 __vec,
+                                               signed long long  __offset,
+                                               signed __int128 *__ptr) {
+  vec_xst(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned __int128 __vec,
+                                               signed long long  __offset,
+                                               unsigned __int128 *__ptr) {
+  vec_xst(__vec, __offset, __ptr);
+}
+#endif
+#else
+  #define vec_xst_be vec_xst
+#endif
+
+#ifdef __POWER9_VECTOR__
+#define vec_test_data_class(__a, __b)                                      \
+        _Generic((__a),                                                    \
+           vector float:                                                   \
+             (vector bool int)__builtin_vsx_xvtstdcsp((__a), (__b)),       \
+           vector double:                                                  \
+             (vector bool long long)__builtin_vsx_xvtstdcdp((__a), (__b))  \
+        )
+
+#endif /* #ifdef __POWER9_VECTOR__ */
+
+static vector float __ATTRS_o_ai vec_neg(vector float __a) {
+  return -__a;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_neg(vector double __a) {
+  return -__a;
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector long long __ATTRS_o_ai vec_neg(vector long long __a) {
+  return -__a;
+}
+#endif
+
+static vector signed int __ATTRS_o_ai vec_neg(vector signed int __a) {
+  return -__a;
+}
+
+static vector signed short __ATTRS_o_ai vec_neg(vector signed short __a) {
+  return -__a;
+}
+
+static vector signed char __ATTRS_o_ai vec_neg(vector signed char __a) {
+  return -__a;
+}
+
+static vector float __ATTRS_o_ai vec_nabs(vector float __a) {
+  return -vec_abs(__a);
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_nabs(vector double __a) {
+  return -vec_abs(__a);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector long long __ATTRS_o_ai vec_nabs(vector long long __a) {
+  return __builtin_altivec_vminsd(__a, -__a);
+}
+#endif
+
+static vector signed int __ATTRS_o_ai vec_nabs(vector signed int __a) {
+  return __builtin_altivec_vminsw(__a, -__a);
+}
+
+static vector signed short __ATTRS_o_ai vec_nabs(vector signed short __a) {
+  return __builtin_altivec_vminsh(__a, -__a);
+}
+
+static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) {
+  return __builtin_altivec_vminsb(__a, -__a);
+}
 #undef __ATTRS_o_ai
 
 #endif /* __ALTIVEC_H */
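The altivec.h additions above include, among other things, unaligned vector
load/store entry points (vec_xl/vec_xst and their big-endian element-order
variants vec_xl_be/vec_xst_be) as well as vec_neg. A minimal usage sketch,
assuming a VSX-enabled POWER target (the function name negate4 is
illustrative, not part of the header):

    #include <altivec.h>

    /* Load four 32-bit ints with vec_xl (no 16-byte alignment required),
       negate them with the new vec_neg overload, and store them back with
       vec_xst.  An offset of 0 avoids depending on how the offset
       parameter is scaled against the pointee type. */
    void negate4(signed int *buf) {
      vector signed int v = vec_xl(0, buf);
      v = vec_neg(v);
      vec_xst(v, 0, buf);
    }

On little-endian targets, vec_xl_be performs the same load but delivers the
elements in big-endian order, via the lxvd2x_be/lxvw4x_be shuffles defined
above.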
diff --git a/darwin-x86/clang-headers/ammintrin.h b/darwin-x86/clang-headers/ammintrin.h
index 8985bb4..680b446 100644
--- a/darwin-x86/clang-headers/ammintrin.h
+++ b/darwin-x86/clang-headers/ammintrin.h
@@ -27,10 +27,10 @@
 #include <pmmintrin.h>
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
 
-/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
-///    integer vector operand at the index idx and of the length len.
+/// Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index \a idx and of the length \a len.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -38,7 +38,7 @@
 /// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
 ///
 /// \param x
 ///    The value from which bits are extracted.
@@ -49,20 +49,21 @@
 ///    Bits [5:0] specify the index of the least significant bit; the other
 ///    bits are ignored. If the sum of the index and length is greater than 64,
 ///    the result is undefined. If the length and index are both zero, bits
-///    [63:0] of parameter x are extracted. If the length is zero but the index
-///    is non-zero, the result is undefined.
+///    [63:0] of parameter \a x are extracted. If the length is zero but the
+///    index is non-zero, the result is undefined.
 /// \returns A 128-bit integer vector whose lower 64 bits contain the bits
 ///    extracted from the source operand.
 #define _mm_extracti_si64(x, len, idx) \
   ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
                                   (char)(len), (char)(idx)))
 
-/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
-///    integer vector operand at the index and of the length specified by __y.
+/// Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index and of the length specified by
+///    \a __y.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
 ///
 /// \param __x
 ///    The value from which bits are extracted.
@@ -71,8 +72,8 @@
 ///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
 ///    length is interpreted as 64. If the sum of the index and length is
 ///    greater than 64, the result is undefined. If the length and index are
-///    both zero, bits [63:0] of parameter __x are extracted. If the length is
-///    zero but the index is non-zero, the result is undefined.
+///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
+///    is zero but the index is non-zero, the result is undefined.
 /// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
 ///    from the source operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -81,9 +82,9 @@
   return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
 }
 
-/// \brief Inserts bits of a specified length from the source integer vector y
-///    into the lower 64 bits of the destination integer vector x at the index
-///    idx and of the length len.
+/// Inserts bits of a specified length from the source integer vector
+///    \a y into the lower 64 bits of the destination integer vector \a x at
+///    the index \a idx and of the length \a len.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -92,15 +93,15 @@
 /// const int idx);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
 ///
 /// \param x
 ///    The destination operand where bits will be inserted. The inserted bits
-///    are defined by the length len and by the index idx specifying the least
-///    significant bit.
+///    are defined by the length \a len and by the index \a idx specifying the
+///    least significant bit.
 /// \param y
 ///    The source operand containing the bits to be extracted. The extracted
-///    bits are the least significant bits of operand y of length len.
+///    bits are the least significant bits of operand \a y of length \a len.
 /// \param len
 ///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
 ///    are zero, the length is interpreted as 64.
@@ -108,58 +109,56 @@
 ///    Bits [5:0] specify the index of the least significant bit; the other
 ///    bits are ignored. If the sum of the index and length is greater than 64,
 ///    the result is undefined. If the length and index are both zero, bits
-///    [63:0] of parameter y are inserted into parameter x. If the length is
-///    zero but the index is non-zero, the result is undefined.
+///    [63:0] of parameter \a y are inserted into parameter \a x. If the length
+///    is zero but the index is non-zero, the result is undefined.
 /// \returns A 128-bit integer vector containing the original lower 64-bits of
-///    destination operand x with the specified bitfields replaced by the lower
-///    bits of source operand y. The upper 64 bits of the return value are
-///    undefined.
-
+///    destination operand \a x with the specified bitfields replaced by the
+///    lower bits of source operand \a y. The upper 64 bits of the return value
+///    are undefined.
 #define _mm_inserti_si64(x, y, len, idx) \
   ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
                                     (__v2di)(__m128i)(y), \
                                     (char)(len), (char)(idx)))
 
-/// \brief Inserts bits of a specified length from the source integer vector
-///    __y into the lower 64 bits of the destination integer vector __x at the
-///    index and of the length specified by __y.
+/// Inserts bits of a specified length from the source integer vector
+///    \a __y into the lower 64 bits of the destination integer vector \a __x
+///    at the index and of the length specified by \a __y.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
 ///
 /// \param __x
 ///    The destination operand where bits will be inserted. The inserted bits
 ///    are defined by the length and by the index of the least significant bit
-///    specified by operand __y.
+///    specified by operand \a __y.
 /// \param __y
 ///    The source operand containing the bits to be extracted. The extracted
-///    bits are the least significant bits of operand __y with length specified
-///    by bits [69:64]. These are inserted into the destination at the index
-///    specified by bits [77:72]; all other bits are ignored. If bits [69:64]
-///    are zero, the length is interpreted as 64. If the sum of the index and
-///    length is greater than 64, the result is undefined. If the length and
-///    index are both zero, bits [63:0] of parameter __y are inserted into
-///    parameter __x. If the length is zero but the index is non-zero, the
-///    result is undefined.
+///    bits are the least significant bits of operand \a __y with length
+///    specified by bits [69:64]. These are inserted into the destination at the
+///    index specified by bits [77:72]; all other bits are ignored. If bits
+///    [69:64] are zero, the length is interpreted as 64. If the sum of the
+///    index and length is greater than 64, the result is undefined. If the
+///    length and index are both zero, bits [63:0] of parameter \a __y are
+///    inserted into parameter \a __x. If the length is zero but the index is
+///    non-zero, the result is undefined.
 /// \returns A 128-bit integer vector containing the original lower 64-bits of
-///    destination operand __x with the specified bitfields replaced by the
-///    lower bits of source operand __y. The upper 64 bits of the return value
-///    are undefined.
-
+///    destination operand \a __x with the specified bitfields replaced by the
+///    lower bits of source operand \a __y. The upper 64 bits of the return
+///    value are undefined.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_insert_si64(__m128i __x, __m128i __y)
 {
   return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
 }
 
-/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
+/// Stores a 64-bit double-precision value in a 64-bit memory location.
 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
 ///    used again soon).
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c MOVNTSD instruction.
+/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
 ///
 /// \param __p
 ///    The 64-bit memory location used to store the register value.
@@ -171,13 +170,13 @@
   __builtin_ia32_movntsd(__p, (__v2df)__a);
 }
 
-/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
+/// Stores a 32-bit single-precision floating-point value in a 32-bit
 ///    memory location. To minimize caching, the data is flagged as
 ///    non-temporal (unlikely to be used again soon).
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c MOVNTSS instruction.
+/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
 ///
 /// \param __p
 ///    The 32-bit memory location used to store the register value.
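As a worked example of the EXTRQ/INSERTQ immediate forms documented above, a
short sketch assuming an SSE4a-capable target built with -msse4a (the helper
names are illustrative):

    #include <x86intrin.h>

    /* Extract the 8 bits at [11:4] of the low 64 bits of x
       (len = 8, idx = 4); the remaining result bits are undefined. */
    __m128i extract_bits_11_4(__m128i x) {
      return _mm_extracti_si64(x, 8, 4);
    }

    /* Insert the low 8 bits of y into bits [11:4] of x's low 64 bits. */
    __m128i insert_bits_11_4(__m128i x, __m128i y) {
      return _mm_inserti_si64(x, y, 8, 4);
    }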
diff --git a/darwin-x86/clang-headers/arm64intr.h b/darwin-x86/clang-headers/arm64intr.h
new file mode 100644
index 0000000..be52283
--- /dev/null
+++ b/darwin-x86/clang-headers/arm64intr.h
@@ -0,0 +1,49 @@
+/*===---- arm64intr.h - ARM64 Windows intrinsics --------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the Windows platform. */
+#ifndef _MSC_VER
+#include_next <arm64intr.h>
+#else
+
+#ifndef __ARM64INTR_H
+#define __ARM64INTR_H
+
+typedef enum
+{
+  _ARM64_BARRIER_SY    = 0xF,
+  _ARM64_BARRIER_ST    = 0xE,
+  _ARM64_BARRIER_LD    = 0xD,
+  _ARM64_BARRIER_ISH   = 0xB,
+  _ARM64_BARRIER_ISHST = 0xA,
+  _ARM64_BARRIER_ISHLD = 0x9,
+  _ARM64_BARRIER_NSH   = 0x7,
+  _ARM64_BARRIER_NSHST = 0x6,
+  _ARM64_BARRIER_NSHLD = 0x5,
+  _ARM64_BARRIER_OSH   = 0x3,
+  _ARM64_BARRIER_OSHST = 0x2,
+  _ARM64_BARRIER_OSHLD = 0x1
+} _ARM64INTR_BARRIER_TYPE;
+
+#endif /* __ARM64INTR_H */
+#endif /* _MSC_VER */
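The _ARM64_BARRIER_* values mirror the barrier-option encodings of the
AArch64 DMB/DSB instructions and are meant to be passed to the MSVC barrier
intrinsics. A sketch, assuming MSVC's __dmb intrinsic from <intrin.h> (the
publish example is illustrative):

    #include <arm64intr.h>
    #include <intrin.h>

    /* Publish data before raising a flag: the full-system data memory
       barrier (DMB SY) orders the data store before the flag store. */
    void publish(int *data, volatile int *flag) {
      *data = 42;
      __dmb(_ARM64_BARRIER_SY);
      *flag = 1;
    }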
diff --git a/darwin-x86/clang-headers/arm_acle.h b/darwin-x86/clang-headers/arm_acle.h
index 8423e62..ab25897 100644
--- a/darwin-x86/clang-headers/arm_acle.h
+++ b/darwin-x86/clang-headers/arm_acle.h
@@ -225,19 +225,49 @@
 }
 
 /*
+ * 9.3 16-bit multiplications
+ */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulbb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulbb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulbt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulbt(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smultb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smultb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smultt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smultt(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulwb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulwb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulwt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulwt(__a, __b);
+}
+#endif
+
+/*
  * 9.4 Saturating intrinsics
  *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
  * intrinsics are implemented and the flag is enabled.
  */
 /* 9.4.1 Width-specified saturation intrinsics */
-#if __ARM_32BIT_STATE
+#if __ARM_FEATURE_SAT
 #define __ssat(x, y) __builtin_arm_ssat(x, y)
 #define __usat(x, y) __builtin_arm_usat(x, y)
 #endif
 
 /* 9.4.2 Saturating addition and subtraction intrinsics */
-#if __ARM_32BIT_STATE
+#if __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __qadd(int32_t __t, int32_t __v) {
   return __builtin_arm_qadd(__t, __v);
@@ -254,6 +284,290 @@
 }
 #endif
 
+/* 9.4.3 Accumulating multiplications */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawt(__a, __b, __c);
+}
+#endif
+
+/* 9.5.4 Parallel 16-bit saturation */
+#if __ARM_FEATURE_SIMD32
+#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
+#define __usat16(x, y) __builtin_arm_usat16(x, y)
+#endif
+
+/* 9.5.5 Packing and unpacking */
+#if __ARM_FEATURE_SIMD32
+typedef int32_t int8x4_t;
+typedef int32_t int16x2_t;
+typedef uint32_t uint8x4_t;
+typedef uint32_t uint16x2_t;
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_sxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtb16(int8x4_t __a) {
+  return __builtin_arm_sxtb16(__a);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_uxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtb16(int8x4_t __a) {
+  return __builtin_arm_uxtb16(__a);
+}
+#endif
+
+/* 9.5.6 Parallel selection */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__sel(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_sel(__a, __b);
+}
+#endif
+
+/* 9.5.7 Parallel 8-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__sadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_sadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__ssub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_ssub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__usub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usub8(__a, __b);
+}
+#endif
+
+/* 9.5.8 Sum of 8-bit absolute differences */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usad8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usad8(__a, __b);
+}
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
+  return __builtin_arm_usada8(__a, __b, __c);
+}
+#endif
+
+/* 9.5.9 Parallel 16-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usub16(__a, __b);
+}
+#endif
+
+/* 9.5.10 Parallel 16-bit multiplications */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlad(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smladx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlald(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlaldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsd(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsdx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsld(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuad(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuad(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuadx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuadx(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusd(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusd(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusdx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusdx(__a, __b);
+}
+#endif
+
 /* 9.7 CRC32 intrinsics */
 #if __ARM_FEATURE_CRC32
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
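The new DSP/SIMD32 wrappers above map one-to-one onto the corresponding
__builtin_arm_* builtins. As a usage sketch of the packed 16-bit
multiply-accumulate path, assuming a target where __ARM_FEATURE_SIMD32 is
set, e.g. Armv7E-M with DSP (dot16 and its memcpy-based packing are
illustrative):

    #include <arm_acle.h>
    #include <stdint.h>
    #include <string.h>

    #if __ARM_FEATURE_SIMD32
    /* acc += a[i]*b[i] + a[i+1]*b[i+1], one SMLAD per element pair.
       int16x2_t is an int32_t holding two packed halfwords, so each
       pair is assembled with memcpy. */
    int32_t dot16(const int16_t *a, const int16_t *b, int n, int32_t acc) {
      for (int i = 0; i + 1 < n; i += 2) {
        int16x2_t va, vb;
        memcpy(&va, &a[i], sizeof va);
        memcpy(&vb, &b[i], sizeof vb);
        acc = __smlad(va, vb, acc);
      }
      return acc;
    }
    #endif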
diff --git a/darwin-x86/clang-headers/arm_fp16.h b/darwin-x86/clang-headers/arm_fp16.h
new file mode 100644
index 0000000..45ff14f
--- /dev/null
+++ b/darwin-x86/clang-headers/arm_fp16.h
@@ -0,0 +1,1499 @@
+/*===---- arm_fp16.h - ARM FP16 intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_FP16_H
+#define __ARM_FP16_H
+
+#include <stdint.h>
+
+typedef __fp16 float16_t;
+#define __ai static inline __attribute__((__always_inline__, __nodebug__))
+
+#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+#define vabdh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vabdh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vabsh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vabsh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vaddh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vaddh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcageh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vcageh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcagth_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vcagth_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcaleh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vcaleh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcalth_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vcalth_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vceqh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vceqh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vceqzh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vceqzh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcgeh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vcgeh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcgezh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcgezh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcgth_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vcgth_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcgtzh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcgtzh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcgtzh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcgtzh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcleh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vcleh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vclezh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vclezh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vclth_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vclth_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcltzh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcltzh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvth_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvth_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvth_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvth_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvth_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvth_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtah_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtah_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtah_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtah_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtah_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtah_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtah_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtah_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtah_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtah_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtah_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtah_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16_t vcvth_f16_u32(uint32_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_u32(__p0);
+  return __ret;
+}
+#else
+__ai float16_t vcvth_f16_u32(uint32_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_u32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16_t vcvth_f16_u64(uint64_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_u64(__p0);
+  return __ret;
+}
+#else
+__ai float16_t vcvth_f16_u64(uint64_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_u64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16_t vcvth_f16_u16(uint16_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_u16(__p0);
+  return __ret;
+}
+#else
+__ai float16_t vcvth_f16_u16(uint16_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_u16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16_t vcvth_f16_s32(int32_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_s32(__p0);
+  return __ret;
+}
+#else
+__ai float16_t vcvth_f16_s32(int32_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16_t vcvth_f16_s64(int64_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_s64(__p0);
+  return __ret;
+}
+#else
+__ai float16_t vcvth_f16_s64(int64_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16_t vcvth_f16_s16(int16_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_s16(__p0);
+  return __ret;
+}
+#else
+__ai float16_t vcvth_f16_s16(int16_t __p0) {
+  float16_t __ret;
+  __ret = (float16_t) __builtin_neon_vcvth_f16_s16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtmh_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtmh_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtmh_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtmh_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtmh_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtmh_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtmh_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtmh_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtmh_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtmh_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtmh_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtmh_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtnh_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtnh_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtnh_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtnh_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtnh_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtnh_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtnh_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtnh_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtnh_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtnh_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtnh_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtnh_u64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtnh_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtnh_u64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtph_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtph_s16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtph_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtph_s32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtph_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtph_s64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtph_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtph_u16_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtph_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtph_u32_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtph_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \
+  __ret; \
+})
+#else
+#define vcvtph_u64_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdivh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vdivh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \
+  __ret; \
+})
+#else
+#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmsh_f16(__s0, __s1, __s2); \
+  __ret; \
+})
+#else
+#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmsh_f16(__s0, __s1, __s2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmaxh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vmaxh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vminh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vminh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vminnmh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vminnmh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vmulh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vmulxh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vnegh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vnegh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrecpeh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrecpeh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrecpeh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrecpeh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrecpsh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vrecpsh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrecpxh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrecpxh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrndh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrndh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrndah_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrndah_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrndih_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrndih_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrndmh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrndmh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrndnh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrndnh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrndph_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrndph_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrndxh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrndxh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsqrteh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \
+  __ret; \
+})
+#else
+#define vrsqrteh_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsqrth_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \
+  __ret; \
+})
+#else
+#define vsqrth_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsubh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \
+  __ret; \
+})
+#else
+#define vsubh_f16(__p0, __p1) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \
+  __ret; \
+})
+#endif
+
+#endif
+
+#undef __ai
+
+#endif /* __ARM_FP16_H */
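
For context: the hunk above completes the set of AArch64 scalar FP16 intrinsics in arm_fp16.h. A minimal consumer might look like the sketch below; the target triple and -march flag are illustrative assumptions (these intrinsics need an FP16-capable AArch64 target, e.g. -march=armv8.2-a+fp16), not something this change itself configures.

  // Illustrative sketch only -- assumes an AArch64 target with FP16 support:
  //   clang --target=aarch64-linux-gnu -march=armv8.2-a+fp16 -c fp16_demo.c
  #include <arm_fp16.h>

  float16_t fused_step(float16_t a, float16_t b, float16_t c) {
    float16_t prod = vmulh_f16(a, b);     // scalar half-precision multiply
    float16_t sum  = vfmah_f16(c, a, b);  // fused multiply-add: c + a*b
    uint16_t ge = vcgeh_f16(sum, prod);   // 0xFFFF if sum >= prod, else 0
    return ge ? vsqrth_f16(sum) : vnegh_f16(prod);
  }

Note that for these scalar operations the __LITTLE_ENDIAN__ and big-endian expansions are intentionally identical: endianness only affects vector lane order, which is why the lane shuffles appear only in the big-endian variants of the vector forms (compare the vld1*_x2/x3/x4 macros added to arm_neon.h below).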
diff --git a/darwin-x86/clang-headers/arm_neon.h b/darwin-x86/clang-headers/arm_neon.h
index f5ca59b..e0efa76 100644
--- a/darwin-x86/clang-headers/arm_neon.h
+++ b/darwin-x86/clang-headers/arm_neon.h
@@ -8587,6 +8587,1278 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
+#define vld1_p8_x2(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8_x2(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16_x2(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16_x2(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8_x2(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8_x2(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16_x2(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16_x2(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8_x2(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8_x2(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32_x2(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32_x2(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64_x2(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64_x2(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16_x2(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16_x2(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8_x2(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8_x2(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32_x2(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32_x2(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16_x2(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16_x2(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32_x2(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32_x2(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64_x2(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64_x2(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16_x2(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16_x2(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8_x2(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8_x2(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32_x2(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32_x2(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64_x2(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64_x2(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16_x2(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16_x2(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8_x2(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8_x2(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32_x2(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32_x2(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16_x2(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16_x2(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32_x2(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32_x2(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64_x2(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64_x2(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16_x2(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16_x2(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8_x3(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8_x3(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16_x3(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16_x3(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8_x3(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8_x3(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16_x3(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16_x3(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8_x3(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8_x3(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32_x3(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32_x3(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64_x3(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64_x3(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16_x3(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16_x3(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8_x3(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8_x3(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32_x3(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32_x3(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16_x3(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16_x3(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32_x3(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32_x3(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64_x3(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64_x3(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16_x3(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16_x3(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8_x3(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8_x3(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32_x3(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32_x3(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64_x3(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64_x3(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16_x3(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16_x3(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8_x3(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8_x3(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32_x3(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32_x3(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16_x3(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16_x3(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32_x3(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32_x3(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64_x3(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64_x3(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16_x3(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16_x3(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
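+// vld1_*_x4 / vld1q_*_x4: same contiguous-load pattern as the _x3
+// forms above, but loading four consecutive vectors into a
+// <type>x<lanes>x4_t aggregate.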
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8_x4(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8_x4(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16_x4(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16_x4(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8_x4(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8_x4(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16_x4(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16_x4(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8_x4(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8_x4(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32_x4(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32_x4(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64_x4(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64_x4(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16_x4(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16_x4(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8_x4(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8_x4(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32_x4(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32_x4(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16_x4(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16_x4(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32_x4(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32_x4(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64_x4(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64_x4(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16_x4(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16_x4(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8_x4(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8_x4(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32_x4(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32_x4(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64_x4(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64_x4(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16_x4(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16_x4(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8_x4(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8_x4(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32_x4(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32_x4(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16_x4(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16_x4(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32_x4(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32_x4(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64_x4(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64_x4(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16_x4(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16_x4(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
 #define vld2_p8(__p0) __extension__ ({ \
   poly8x8x2_t __ret; \
   __builtin_neon_vld2_v(&__ret, __p0, 4); \
@@ -8989,6 +10261,210 @@
 #endif
 
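+// vld2q_dup_*: q-register duplicate loads.  Two consecutive elements
+// are read from __p0 and each is replicated across every lane of the
+// corresponding val[0] / val[1] vector; the big-endian variants then
+// apply the usual lane reversal.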
 #ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld2q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld2q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld2q_dup_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld2q_dup_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
 #define vld2_dup_u8(__p0) __extension__ ({ \
   uint8x8x2_t __ret; \
   __builtin_neon_vld2_dup_v(&__ret, __p0, 16); \
@@ -9951,6 +11427,222 @@
 #endif
 
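+// vld3q_dup_*: as vld2q_dup above, but reading three consecutive
+// elements and replicating them across val[0]..val[2].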
 #ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld3q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld3q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld3q_dup_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld3q_dup_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
 #define vld3_dup_u8(__p0) __extension__ ({ \
   uint8x8x3_t __ret; \
   __builtin_neon_vld3_dup_v(&__ret, __p0, 16); \
@@ -10977,6 +12669,234 @@
 #endif
 
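+// vld4q_dup_*: as vld2q_dup/vld3q_dup above, with four consecutive
+// elements replicated across val[0]..val[3].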
 #ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld4q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld4q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld4q_dup_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld4q_dup_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
 #define vld4_dup_u8(__p0) __extension__ ({ \
   uint8x8x4_t __ret; \
   __builtin_neon_vld4_dup_v(&__ret, __p0, 16); \
@@ -25581,6 +27501,1134 @@
 #endif
 
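+// vst1_*_xN / vst1q_*_xN mirror the contiguous vld1_*_xN loads: the N
+// vectors in __p1 are stored back-to-back at __p0 without
+// interleaving.  On big-endian targets the source vectors are first
+// lane-reversed into a temporary __rev1 so that memory order is
+// preserved; single-lane 64-bit vectors are again stored as-is.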
 #ifdef __LITTLE_ENDIAN__
+#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
+})
+#else
+#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \
+})
+#else
+#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \
+})
+#else
+#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \
+})
+#else
+#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \
+})
+#else
+#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \
+})
+#else
+#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \
+})
+#else
+#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \
+})
+#else
+#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \
+})
+#else
+#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 41); \
+})
+#else
+#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 40); \
+})
+#else
+#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 34); \
+})
+#else
+#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 35); \
+})
+#else
+#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 33); \
+})
+#else
+#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \
+})
+#else
+#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \
+})
+#else
+#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#else
+#define vst1_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \
+})
+#else
+#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \
+})
+#else
+#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 9); \
+})
+#else
+#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 8); \
+})
+#else
+#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 2); \
+})
+#else
+#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#else
+#define vst1_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 1); \
+})
+#else
+#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 1); \
+})
+#endif
+
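+// vst1_*_x3 / vst1q_*_x3: three-vector contiguous stores, following
+// the same pattern as the _x2 forms above.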
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \
+})
+#else
+#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \
+})
+#else
+#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \
+})
+#else
+#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \
+})
+#else
+#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \
+})
+#else
+#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \
+})
+#else
+#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \
+})
+#else
+#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \
+})
+#else
+#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \
+})
+#else
+#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 41); \
+})
+#else
+#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 40); \
+})
+#else
+#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 34); \
+})
+#else
+#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 35); \
+})
+#else
+#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 33); \
+})
+#else
+#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \
+})
+#else
+#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \
+})
+#else
+#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#else
+#define vst1_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \
+})
+#else
+#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \
+})
+#else
+#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 9); \
+})
+#else
+#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 8); \
+})
+#else
+#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 2); \
+})
+#else
+#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#else
+#define vst1_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 1); \
+})
+#else
+#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 1); \
+})
+#endif
+
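A minimal usage sketch for the new vst1*_x2/_x3 forms (assuming the usual arm_neon.h convention that float32x4x3_t is a struct wrapping val[3]): unlike vst3q_f32, the _x3 store writes its three registers back-to-back with no lane interleaving, and on big-endian targets the macro first restores ascending lane order via __builtin_shufflevector so the memory image matches the little-endian one.

#include <arm_neon.h>

/* Store three q-registers to 12 contiguous floats: dst[0..3] = a,
 * dst[4..7] = b, dst[8..11] = c.  No interleaving takes place. */
void store_three_f32(float32_t *dst, float32x4_t a, float32x4_t b, float32x4_t c) {
  float32x4x3_t v = { { a, b, c } };
  vst1q_f32_x3(dst, v);
}
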
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \
+})
+#else
+#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \
+})
+#else
+#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \
+})
+#else
+#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \
+})
+#else
+#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \
+})
+#else
+#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \
+})
+#else
+#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \
+})
+#else
+#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \
+})
+#else
+#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \
+})
+#else
+#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 41); \
+})
+#else
+#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 40); \
+})
+#else
+#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 34); \
+})
+#else
+#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 35); \
+})
+#else
+#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 33); \
+})
+#else
+#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \
+})
+#else
+#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \
+})
+#else
+#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#else
+#define vst1_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \
+})
+#else
+#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \
+})
+#else
+#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 9); \
+})
+#else
+#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 8); \
+})
+#else
+#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 2); \
+})
+#else
+#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#else
+#define vst1_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 1); \
+})
+#else
+#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 1); \
+})
+#endif
+
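The _x4 variants above follow the same pattern with four registers; the single-lane 64-bit cases (vst1_u64_x4, vst1_s64_x4) compile to identical code on both endiannesses because a one-lane vector has nothing to reverse. A sketch mirroring the example further up:

#include <arm_neon.h>

/* 32 contiguous bytes from four d-registers; on big-endian the macro
 * reverses each 8-lane vector before calling the underlying builtin. */
void store_four_u8(uint8_t *dst, uint8x8x4_t v) {
  vst1_u8_x4(dst, v);
}
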
+#ifdef __LITTLE_ENDIAN__
 #define vst2_p8(__p0, __p1) __extension__ ({ \
   poly8x8x2_t __s1 = __p1; \
   __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
@@ -29838,6 +32886,110 @@
 
 #if !defined(__aarch64__)
 #ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vdupq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vdup_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmovq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vmovq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmov_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vmov_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
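These additions give float16x4/float16x8 the same broadcast family the other element types already have: vdup_n/vmov_n splat a scalar, while vdup(q)_lane replicates one lane of an existing vector (the lane index must be a constant expression, which is one reason these are macros rather than functions). A sketch, assuming the toolchain supports the float16_t scalar type:

#include <arm_neon.h>

float16x8_t splat_half(float16_t x) {
  return vdupq_n_f16(x);        /* eight copies of x */
}

float16x4_t broadcast_lane1(float16x4_t v) {
  return vdup_lane_f16(v, 1);   /* four copies of v[1]; the lane is a literal */
}
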
+#ifdef __LITTLE_ENDIAN__
 __ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
   poly8x8_t __ret;
   __ret = (poly8x8_t)(__p0);
@@ -33836,6 +36988,245 @@
 #endif
 
 #endif
+#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vsha1h_u32(uint32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vsha1h_u32(uint32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
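
This crypto block, now guarded by __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) rather than the bare #if __ARM_FEATURE_CRYPTO test removed further down, exposes the AArch32 AES and SHA-1/SHA-256 primitives. A sketch of the canonical round patterns, assuming a build with the crypto extension enabled:

#include <arm_neon.h>

/* One full AES encryption round (every round of AES-128 except the last):
 * vaeseq_u8 = AddRoundKey + SubBytes + ShiftRows, vaesmcq_u8 = MixColumns. */
uint8x16_t aes_enc_round(uint8x16_t state, uint8x16_t round_key) {
  return vaesmcq_u8(vaeseq_u8(state, round_key));
}

/* SHA-256 message-schedule step: expand words w[t-16..t-1] into w[t..t+3]. */
uint32x4_t sha256_schedule(uint32x4_t w0, uint32x4_t w4, uint32x4_t w8, uint32x4_t w12) {
  return vsha256su1q_u32(vsha256su0q_u32(w0, w4), w8, w12);
}
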
 #if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)
 #ifdef __LITTLE_ENDIAN__
 __ai float32x4_t vrndq_f32(float32x4_t __p0) {
@@ -33902,6 +37293,38 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndiq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndiq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndi_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndi_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
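vrndiq_f32/vrndi_f32 are the vector counterparts of nearbyintf: each lane is rounded to an integral value using the current rounding mode, without raising the inexact exception (vrndx/rint is the variant that does). The duplicate f32 definitions in the aarch64-only block are deleted further down. A one-liner:

#include <arm_neon.h>

float32x4_t round_current_mode(float32x4_t v) {
  return vrndiq_f32(v);   /* per-lane nearbyintf(), honouring the FP rounding mode */
}
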
+#ifdef __LITTLE_ENDIAN__
 __ai float32x4_t vrndmq_f32(float32x4_t __p0) {
   float32x4_t __ret;
   __ret = (float32x4_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 41);
@@ -33966,6 +37389,20 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
+__ai float32_t vrndns_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrndns_f32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vrndns_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrndns_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
 __ai float32x4_t vrndpq_f32(float32x4_t __p0) {
   float32x4_t __ret;
   __ret = (float32x4_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 41);
@@ -34030,6 +37467,200 @@
 #endif
 
 #endif
+#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrndq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrndq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrnd_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrnd_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrndaq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrndaq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrnda_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrnda_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrndmq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrndmq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrndm_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrndm_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrndnq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrndnq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrndn_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrndn_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrndpq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrndpq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrndp_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrndp_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrndxq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrndxq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrndx_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrndx_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
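
With __ARM_FEATURE_FP16_VECTOR_ARITHMETIC the directed-rounding family above becomes available for half precision too: vrnd truncates toward zero, vrnda rounds ties away from zero, vrndm is floor, vrndn rounds ties to even, vrndp is ceil, and vrndx rounds in the current mode while signalling inexact. A sketch:

#include <arm_neon.h>

float16x8_t floor_halves(float16x8_t v) {
  return vrndmq_f16(v);   /* per-lane floor (round toward minus infinity) */
}
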
 #if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)
 #ifdef __LITTLE_ENDIAN__
 __ai float32x4_t vmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
@@ -34100,6 +37731,76 @@
 #endif
 
 #endif
+#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
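vmaxnm/vminnm follow IEEE 754-2008 maxNum/minNum semantics: when exactly one operand lane is a quiet NaN, the numeric lane wins instead of the NaN propagating. A short sketch under the same feature guards (the function name is illustrative):

#include <arm_neon.h>

// Clamp to an upper bound; NaN lanes in x resolve to the bound, not NaN.
float16x8_t clamp_above(float16x8_t x, float16x8_t bound) {
  return vminnmq_f16(x, bound);
}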
 #if __ARM_ARCH >= 8 && defined(__aarch64__)
 #ifdef __LITTLE_ENDIAN__
 __ai int64x2_t vcvtaq_s64_f64(float64x2_t __p0) {
@@ -39908,22 +43609,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai float32x4_t vrndiq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai float32x4_t vrndiq_f32(float32x4_t __p0) {
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 __ai float64x1_t vrndi_f64(float64x1_t __p0) {
   float64x1_t __ret;
   __ret = (float64x1_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 10);
@@ -39938,22 +43623,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai float32x2_t vrndi_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai float32x2_t vrndi_f32(float32x2_t __p0) {
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 __ai float64x2_t vrndmq_f64(float64x2_t __p0) {
   float64x2_t __ret;
   __ret = (float64x2_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 42);
@@ -40138,242 +43807,323 @@
 #endif
 
 #endif
-#if __ARM_FEATURE_CRYPTO
+#if defined(__ARM_FEATURE_DOTPROD)
 #ifdef __LITTLE_ENDIAN__
-__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+__ai uint32x4_t vdotq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vdotq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
   return __ret;
 }
 #else
-__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+__ai uint32x4_t vdotq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
   uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
-  return __ret;
-}
-#else
-__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = (uint32x4_t) __builtin_neon_vdotq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
   __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
   return __ret;
 }
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint32_t vsha1h_u32(uint32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0);
-  return __ret;
-}
-#else
-__ai uint32_t vsha1h_u32(uint32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0);
+__ai uint32x4_t __noswap_vdotq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vdotq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
   return __ret;
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+__ai int32x4_t vdotq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vdotq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
   return __ret;
 }
 #else
-__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+__ai int32x4_t vdotq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vdotq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
   __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
   return __ret;
 }
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
-  return __ret;
-}
-#else
-__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+__ai int32x4_t __noswap_vdotq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vdotq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
   return __ret;
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+__ai uint32x2_t vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vdot_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 18);
   return __ret;
 }
 #else
-__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+__ai uint32x2_t vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vdot_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vdot_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 18);
   return __ret;
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+__ai int32x2_t vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vdot_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
   return __ret;
 }
 #else
-__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+__ai int32x2_t vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vdot_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vdot_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
   return __ret;
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
+#define vdotq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x8_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  uint8x8_t __reint = __s2; \
+  uint32x4_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3, __p3, __p3); \
+  __ret = vdotq_u32(__s0, __s1, *(uint8x16_t *) &__reint1); \
+  __ret; \
+})
 #else
-__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
+#define vdotq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x8_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  uint8x8_t __reint = __rev2; \
+  uint32x4_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3, __p3, __p3); \
+  __ret = __noswap_vdotq_u32(__rev0, __rev1, *(uint8x16_t *) &__reint1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
+#define vdotq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  int8x8_t __reint = __s2; \
+  int32x4_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3, __p3, __p3); \
+  __ret = vdotq_s32(__s0, __s1, *(int8x16_t *) &__reint1); \
+  __ret; \
+})
 #else
-__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
+#define vdotq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  int8x8_t __reint = __rev2; \
+  int32x4_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3, __p3, __p3); \
+  __ret = __noswap_vdotq_s32(__rev0, __rev1, *(int8x16_t *) &__reint1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
+#define vdot_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  uint8x8_t __reint = __s2; \
+  uint32x2_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3); \
+  __ret = vdot_u32(__s0, __s1, *(uint8x8_t *) &__reint1); \
+  __ret; \
+})
 #else
-__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
+#define vdot_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  uint8x8_t __reint = __rev2; \
+  uint32x2_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3); \
+  __ret = __noswap_vdot_u32(__rev0, __rev1, *(uint8x8_t *) &__reint1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
+#define vdot_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __s2 = __p2; \
+  int32x2_t __ret; \
+  int8x8_t __reint = __s2; \
+  int32x2_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3); \
+  __ret = vdot_s32(__s0, __s1, *(int8x8_t *) &__reint1); \
+  __ret; \
+})
 #else
-__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
+#define vdot_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  int8x8_t __reint = __rev2; \
+  int32x2_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3); \
+  __ret = __noswap_vdot_s32(__rev0, __rev1, *(int8x8_t *) &__reint1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#endif
+#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+#define vdotq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  uint8x16_t __reint = __s2; \
+  uint32x4_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3, __p3, __p3); \
+  __ret = vdotq_u32(__s0, __s1, *(uint8x16_t *) &__reint1); \
+  __ret; \
+})
+#else
+#define vdotq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  uint8x16_t __reint = __rev2; \
+  uint32x4_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3, __p3, __p3); \
+  __ret = __noswap_vdotq_u32(__rev0, __rev1, *(uint8x16_t *) &__reint1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdotq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __s2 = __p2; \
+  int32x4_t __ret; \
+  int8x16_t __reint = __s2; \
+  int32x4_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3, __p3, __p3); \
+  __ret = vdotq_s32(__s0, __s1, *(int8x16_t *) &__reint1); \
+  __ret; \
+})
+#else
+#define vdotq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  int8x16_t __reint = __rev2; \
+  int32x4_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3, __p3, __p3); \
+  __ret = __noswap_vdotq_s32(__rev0, __rev1, *(int8x16_t *) &__reint1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdot_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x16_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  uint8x16_t __reint = __s2; \
+  uint32x2_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3); \
+  __ret = vdot_u32(__s0, __s1, *(uint8x8_t *) &__reint1); \
+  __ret; \
+})
+#else
+#define vdot_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x16_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  uint8x16_t __reint = __rev2; \
+  uint32x2_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3); \
+  __ret = __noswap_vdot_u32(__rev0, __rev1, *(uint8x8_t *) &__reint1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdot_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x16_t __s2 = __p2; \
+  int32x2_t __ret; \
+  int8x16_t __reint = __s2; \
+  int32x2_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3); \
+  __ret = vdot_s32(__s0, __s1, *(int8x8_t *) &__reint1); \
+  __ret; \
+})
+#else
+#define vdot_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x16_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  int8x16_t __reint = __rev2; \
+  int32x2_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3); \
+  __ret = __noswap_vdot_s32(__rev0, __rev1, *(int8x8_t *) &__reint1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
 #endif
 
 #endif
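This hunk replaces the crypto intrinsics formerly emitted at this position with the Armv8.2 dot-product family: vdot/vdotq accumulate four adjacent 8-bit products into each 32-bit lane (SDOT/UDOT), and the _lane/_laneq forms broadcast one four-byte group selected from the final operand, which is what the __reint shuffles above implement. A minimal int8 kernel sketch, assuming an AArch64 target with -march=armv8.2-a+dotprod and n a multiple of 16 (both assumptions of this example, not requirements of the header):

#include <arm_neon.h>
#include <stddef.h>

int32_t dot_s8(const int8_t *a, const int8_t *b, size_t n) {
  int32x4_t acc = vdupq_n_s32(0);
  for (size_t i = 0; i < n; i += 16) {
    // Each 32-bit lane accumulates four adjacent a[]*b[] products (SDOT).
    acc = vdotq_s32(acc, vld1q_s8(a + i), vld1q_s8(b + i));
  }
  return vaddvq_s32(acc);  // horizontal sum of the four accumulators
}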
@@ -40425,6 +44175,40 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __ret;
+  __ret = vfmaq_f32(__p0, __p1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vfmaq_f32(__rev0, __rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __ret;
+  __ret = vfma_f32(__p0, __p1, (float32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __noswap_vfma_f32(__rev0, __rev1, (float32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
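vfmaq_n_f32/vfma_n_f32 simply splat the scalar and forward to the fused multiply-add, so each lane computes __p0 + __p1 * __p2 with a single rounding (FMLA). A usage sketch (names illustrative):

#include <arm_neon.h>

// One axpy step, y <- y + a*x, fused per lane.
float32x4_t axpy4(float32x4_t y, float32x4_t x, float a) {
  return vfmaq_n_f32(y, x, a);
}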
+#ifdef __LITTLE_ENDIAN__
 __ai float32x4_t vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
   float32x4_t __ret;
   __ret = vfmaq_f32(__p0, -__p1, __p2);
@@ -40461,6 +44245,3098 @@
 #endif
 
 #endif
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vabdq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vabdq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vabd_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vabd_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vabsq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vabsq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vabs_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vabs_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vabs_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vaddq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float16x8_t vaddq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vadd_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float16x4_t vadd_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
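Of the FP16 arithmetic above, vadd{q}_f16 lowers to the plain vector + operator while vabd{q}_f16 goes through a builtin (FABD). Together they make a half-precision absolute-difference accumulator; a sketch under the same guard:

#include <arm_neon.h>

// acc <- acc + |a - b|, per lane.
float16x8_t sad_step(float16x8_t acc, float16x8_t a, float16x8_t b) {
  return vaddq_f16(acc, vabdq_f16(a, b));
}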
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vbslq_f16(uint16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vbslq_f16(uint16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vbsl_f16(uint16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vbsl_f16(uint16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcageq_f16(float16x8_t __p0, float16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcageq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcageq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcageq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcage_f16(float16x4_t __p0, float16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcage_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcage_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcagtq_f16(float16x8_t __p0, float16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcagtq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcagtq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcagtq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcagt_f16(float16x4_t __p0, float16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcagt_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcagt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcaleq_f16(float16x8_t __p0, float16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcaleq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcaleq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcaleq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcale_f16(float16x4_t __p0, float16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcale_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcale_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcaltq_f16(float16x8_t __p0, float16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcaltq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcaltq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcaltq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcalt_f16(float16x4_t __p0, float16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcalt_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcalt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqq_f16(float16x8_t __p0, float16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceq_f16(float16x4_t __p0, float16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceq_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqzq_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqzq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceqz_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceqz_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgeq_f16(float16x8_t __p0, float16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgeq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcge_f16(float16x4_t __p0, float16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcge_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgezq_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgezq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgez_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgez_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgtq_f16(float16x8_t __p0, float16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgtq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgt_f16(float16x4_t __p0, float16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgt_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgtzq_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgtzq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgtz_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgtz_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcleq_f16(float16x8_t __p0, float16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcleq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcle_f16(float16x4_t __p0, float16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcle_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vclezq_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vclezq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclez_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclez_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcltq_f16(float16x8_t __p0, float16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcltq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclt_f16(float16x4_t __p0, float16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclt_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcltzq_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcltzq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcltz_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcltz_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
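All of the FP16 comparisons above return uint16 masks, all-ones in lanes where the predicate holds, which is exactly the selector form vbslq_f16 consumes, so branch-free per-lane selection composes directly. A sketch:

#include <arm_neon.h>

// Branch-free per-lane maximum from a compare mask plus bitwise select.
float16x8_t max_by_select(float16x8_t a, float16x8_t b) {
  uint16x8_t a_gt_b = vcgtq_f16(a, b);  // 0xFFFF where a > b, else 0
  return vbslq_f16(a_gt_b, a, b);       // take a where the mask is set
}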
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vcvtq_f16_u16(uint16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai float16x8_t vcvtq_f16_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vcvtq_f16_s16(int16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai float16x8_t vcvtq_f16_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vcvt_f16_u16(uint16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai float16x4_t vcvt_f16_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vcvt_f16_s16(int16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai float16x4_t vcvt_f16_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f16_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f16_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f16_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f16_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f16_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vcvt_n_f16_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f16_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vcvt_n_f16_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_s16_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vcvtq_n_s16_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vcvtq_n_s16_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vcvtq_n_s16_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_s16_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vcvt_n_s16_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vcvt_n_s16_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vcvt_n_s16_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_u16_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vcvtq_n_u16_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vcvtq_n_u16_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vcvtq_n_u16_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_u16_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vcvt_n_u16_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vcvt_n_u16_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vcvt_n_u16_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
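The _n_ conversion macros treat the integer side as fixed point with __p1 fraction bits, scaling by 2^__p1 during the convert; __p1 must be an integer constant (in [1,16] for these 16-bit forms). A sketch converting Q8.8 fixed point, where the format choice is illustrative:

#include <arm_neon.h>

// Q8.8 -> float16: each lane is divided by 2^8 during the conversion.
float16x8_t q88_to_f16(int16x8_t q) {
  return vcvtq_n_f16_s16(q, 8);
}

// float16 -> Q8.8: each lane is multiplied by 2^8, then truncated.
int16x8_t f16_to_q88(float16x8_t v) {
  return vcvtq_n_s16_f16(v, 8);
}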
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vcvtq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtq_s16_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vcvtq_s16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtq_s16_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vcvt_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvt_s16_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vcvt_s16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvt_s16_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcvtq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtq_u16_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcvtq_u16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtq_u16_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcvt_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvt_u16_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcvt_u16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvt_u16_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
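+// vcvt{a,m,n,p}: conversions with an explicit rounding mode -- 'a' rounds
+// to nearest with ties away from zero, 'm' toward minus infinity, 'n' to
+// nearest with ties to even, and 'p' toward plus infinity.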
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vcvtaq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtaq_s16_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vcvtaq_s16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtaq_s16_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vcvta_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvta_s16_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vcvta_s16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvta_s16_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcvtaq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtaq_u16_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcvtaq_u16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtaq_u16_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcvta_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvta_u16_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcvta_u16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvta_u16_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vcvtmq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtmq_s16_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vcvtmq_s16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtmq_s16_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vcvtm_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvtm_s16_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vcvtm_s16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvtm_s16_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcvtmq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtmq_u16_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcvtmq_u16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtmq_u16_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcvtm_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvtm_u16_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcvtm_u16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvtm_u16_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vcvtnq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtnq_s16_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vcvtnq_s16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtnq_s16_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vcvtn_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvtn_s16_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vcvtn_s16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvtn_s16_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcvtnq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtnq_u16_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcvtnq_u16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtnq_u16_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcvtn_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvtn_u16_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcvtn_u16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvtn_u16_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vcvtpq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtpq_s16_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vcvtpq_s16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vcvtpq_s16_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vcvtp_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvtp_s16_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vcvtp_s16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcvtp_s16_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcvtpq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtpq_u16_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcvtpq_u16_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcvtpq_u16_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcvtp_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvtp_u16_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcvtp_u16_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcvtp_u16_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
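+// vext{q}_f16: extract a vector of adjacent lanes from the concatenation
+// of __p0 and __p1, starting at lane __p2 of __p0.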
+#ifdef __LITTLE_ENDIAN__
+#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 40); \
+  __ret; \
+})
+#else
+#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 40); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 8); \
+  __ret; \
+})
+#else
+#define vext_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 8); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
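+// vfma{q}_f16(a, b, c) computes a + b * c with a single rounding (fused
+// multiply-add).  The __noswap_ helpers, defined only for big-endian
+// builds, operate on lanes the caller has already reversed, so that other
+// big-endian implementations below can reuse them without reversing twice.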
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x8_t __noswap_vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x4_t __noswap_vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
+  return __ret;
+}
+#endif
+
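+// vfms{q}_f16(a, b, c) computes a - b * c, expressed as a fused
+// multiply-add with the second operand negated.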
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
+  float16x8_t __ret;
+  __ret = vfmaq_f16(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __noswap_vfmaq_f16(__rev0, -__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
+  float16x4_t __ret;
+  __ret = vfma_f16(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __noswap_vfma_f16(__rev0, -__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
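+// vmax/vmin: lane-wise maximum and minimum.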
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
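+// vmul and its _lane/_n forms: lane-wise multiplication.  The _lane forms
+// multiply by one lane of the second vector broadcast to every lane (via
+// __builtin_shufflevector); the _n forms broadcast a scalar, e.g.
+// vmul_n_f16(x, s) yields x[i] * s in every lane.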
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_n_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x8_t __ret; \
+  __ret = __s0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \
+  __ret; \
+})
+#else
+#define vmulq_n_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __rev0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_n_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x4_t __ret; \
+  __ret = __s0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \
+  __ret; \
+})
+#else
+#define vmul_n_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __rev0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
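+// vneg: lane-wise negation.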
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vnegq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float16x8_t vnegq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vneg_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float16x4_t vneg_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
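+// vpadd/vpmax/vpmin: pairwise operations -- adjacent lanes of __p0 are
+// combined into the low half of the result and adjacent lanes of __p1
+// into the high half.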
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
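+// vrecpe/vrecps: reciprocal estimate and Newton-Raphson step
+// (vrecps_f16(a, b) returns 2 - a * b).  A typical refinement sequence
+// (sketch):
+//   float16x4_t __r = vrecpe_f16(__x);          // initial estimate of 1/x
+//   __r = vmul_f16(__r, vrecps_f16(__x, __r));  // one refinement step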
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrecpeq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrecpeq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrecpe_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrecpe_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
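+// vrev64: reverse the lanes within each 64-bit half of the vector.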
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrev64q_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai float16x8_t vrev64q_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrev64_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai float16x4_t vrev64_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
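+// vrsqrte/vrsqrts: reciprocal square-root estimate and Newton-Raphson step
+// (vrsqrts_f16(a, b) returns (3 - a * b) / 2), refined like the
+// vrecpe/vrecps pair above (sketch):
+//   float16x4_t __r = vrsqrte_f16(__x);                    // ~1/sqrt(x)
+//   __r = vmul_f16(__r, vrsqrts_f16(vmul_f16(__x, __r), __r));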
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrsqrte_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrsqrte_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
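+// vsub: lane-wise subtraction.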
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
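+// vtrn/vuzp/vzip: transpose, unzip, and zip two vectors.  These builtins
+// write a two-vector struct through the address of __ret instead of
+// returning it by value.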
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__)
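+// Intrinsics in this block require AArch64 as well as the FP16 extension:
+// division, scalar lane extraction, lane forms of multiply and fused
+// multiply-add, across-vector reductions, and multiply-extended.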
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __rev0 / __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __rev0 / __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
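+// vduph_lane{q}: extract a single float16 lane as a scalar.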
+#ifdef __LITTLE_ENDIAN__
+#define vduph_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
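+// vfmah_lane and the vfma{q}_lane{q} family: fused multiply-add where the
+// last vector operand contributes a single lane ('laneq' takes the lane
+// from a 128-bit vector).  For the scalar 'h' forms only the lane source
+// needs reversing on big-endian targets.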
+#ifdef __LITTLE_ENDIAN__
+#define vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x4_t __s2 = __p2; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x4_t __s2 = __p2; \
+  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x4_t __s2 = __p2; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x4_t __s2 = __p2; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 40); \
+  __ret; \
+})
+#else
+#define vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x4_t __s2 = __p2; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, __p3, 40); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x4_t __s2 = __p2; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 40); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __s2 = __p2; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 8); \
+  __ret; \
+})
+#else
+#define vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __s2 = __p2; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vfma_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, __p3, 8); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __s2 = __p2; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 8); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x8_t __s2 = __p2; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x8_t __s2 = __p2; \
+  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x8_t __s2 = __p2; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __s2 = __p2; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 40); \
+  __ret; \
+})
+#else
+#define vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __s2 = __p2; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 40); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __s2 = __p2; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 40); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x8_t __s2 = __p2; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 8); \
+  __ret; \
+})
+#else
+#define vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x8_t __s2 = __p2; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vfma_laneq_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x16_t)__rev2, __p3, 8); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x8_t __s2 = __p2; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 8); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_n_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16x8_t __ret; \
+  __ret = vfmaq_f16(__s0, __s1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \
+  __ret; \
+})
+#else
+#define vfmaq_n_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __noswap_vfmaq_f16(__rev0, __rev1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_n_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16x4_t __ret; \
+  __ret = vfma_f16(__s0, __s1, (float16x4_t) {__s2, __s2, __s2, __s2}); \
+  __ret; \
+})
+#else
+#define vfma_n_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __noswap_vfma_f16(__rev0, __rev1, (float16x4_t) {__s2, __s2, __s2, __s2}); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
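+// The vfms lane variants are again vfma with a negated operand.  The
+// numbered suffixes on the parameters (__p0_0, __p0_1, ...) keep the
+// temporaries unique, since these macros expand into other vfma macros
+// that declare the same __s/__rev names.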
+#ifdef __LITTLE_ENDIAN__
+#define vfmsh_lane_f16(__p0_0, __p1_0, __p2_0, __p3_0) __extension__ ({ \
+  float16_t __s0_0 = __p0_0; \
+  float16_t __s1_0 = __p1_0; \
+  float16x4_t __s2_0 = __p2_0; \
+  float16_t __ret_0; \
+  __ret_0 = vfmah_lane_f16(__s0_0, -__s1_0, __s2_0, __p3_0); \
+  __ret_0; \
+})
+#else
+#define vfmsh_lane_f16(__p0_1, __p1_1, __p2_1, __p3_1) __extension__ ({ \
+  float16_t __s0_1 = __p0_1; \
+  float16_t __s1_1 = __p1_1; \
+  float16x4_t __s2_1 = __p2_1; \
+  float16x4_t __rev2_1;  __rev2_1 = __builtin_shufflevector(__s2_1, __s2_1, 3, 2, 1, 0); \
+  float16_t __ret_1; \
+  __ret_1 = __noswap_vfmah_lane_f16(__s0_1, -__s1_1, __rev2_1, __p3_1); \
+  __ret_1; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_lane_f16(__p0_2, __p1_2, __p2_2, __p3_2) __extension__ ({ \
+  float16x8_t __s0_2 = __p0_2; \
+  float16x8_t __s1_2 = __p1_2; \
+  float16x4_t __s2_2 = __p2_2; \
+  float16x8_t __ret_2; \
+  __ret_2 = vfmaq_lane_f16(__s0_2, -__s1_2, __s2_2, __p3_2); \
+  __ret_2; \
+})
+#else
+#define vfmsq_lane_f16(__p0_3, __p1_3, __p2_3, __p3_3) __extension__ ({ \
+  float16x8_t __s0_3 = __p0_3; \
+  float16x8_t __s1_3 = __p1_3; \
+  float16x4_t __s2_3 = __p2_3; \
+  float16x8_t __rev0_3;  __rev0_3 = __builtin_shufflevector(__s0_3, __s0_3, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev1_3;  __rev1_3 = __builtin_shufflevector(__s1_3, __s1_3, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __rev2_3;  __rev2_3 = __builtin_shufflevector(__s2_3, __s2_3, 3, 2, 1, 0); \
+  float16x8_t __ret_3; \
+  __ret_3 = __noswap_vfmaq_lane_f16(__rev0_3, -__rev1_3, __rev2_3, __p3_3); \
+  __ret_3 = __builtin_shufflevector(__ret_3, __ret_3, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_3; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_lane_f16(__p0_4, __p1_4, __p2_4, __p3_4) __extension__ ({ \
+  float16x4_t __s0_4 = __p0_4; \
+  float16x4_t __s1_4 = __p1_4; \
+  float16x4_t __s2_4 = __p2_4; \
+  float16x4_t __ret_4; \
+  __ret_4 = vfma_lane_f16(__s0_4, -__s1_4, __s2_4, __p3_4); \
+  __ret_4; \
+})
+#else
+#define vfms_lane_f16(__p0_5, __p1_5, __p2_5, __p3_5) __extension__ ({ \
+  float16x4_t __s0_5 = __p0_5; \
+  float16x4_t __s1_5 = __p1_5; \
+  float16x4_t __s2_5 = __p2_5; \
+  float16x4_t __rev0_5;  __rev0_5 = __builtin_shufflevector(__s0_5, __s0_5, 3, 2, 1, 0); \
+  float16x4_t __rev1_5;  __rev1_5 = __builtin_shufflevector(__s1_5, __s1_5, 3, 2, 1, 0); \
+  float16x4_t __rev2_5;  __rev2_5 = __builtin_shufflevector(__s2_5, __s2_5, 3, 2, 1, 0); \
+  float16x4_t __ret_5; \
+  __ret_5 = __noswap_vfma_lane_f16(__rev0_5, -__rev1_5, __rev2_5, __p3_5); \
+  __ret_5 = __builtin_shufflevector(__ret_5, __ret_5, 3, 2, 1, 0); \
+  __ret_5; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsh_laneq_f16(__p0_6, __p1_6, __p2_6, __p3_6) __extension__ ({ \
+  float16_t __s0_6 = __p0_6; \
+  float16_t __s1_6 = __p1_6; \
+  float16x8_t __s2_6 = __p2_6; \
+  float16_t __ret_6; \
+  __ret_6 = vfmah_laneq_f16(__s0_6, -__s1_6, __s2_6, __p3_6); \
+  __ret_6; \
+})
+#else
+#define vfmsh_laneq_f16(__p0_7, __p1_7, __p2_7, __p3_7) __extension__ ({ \
+  float16_t __s0_7 = __p0_7; \
+  float16_t __s1_7 = __p1_7; \
+  float16x8_t __s2_7 = __p2_7; \
+  float16x8_t __rev2_7;  __rev2_7 = __builtin_shufflevector(__s2_7, __s2_7, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret_7; \
+  __ret_7 = __noswap_vfmah_laneq_f16(__s0_7, -__s1_7, __rev2_7, __p3_7); \
+  __ret_7; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_laneq_f16(__p0_8, __p1_8, __p2_8, __p3_8) __extension__ ({ \
+  float16x8_t __s0_8 = __p0_8; \
+  float16x8_t __s1_8 = __p1_8; \
+  float16x8_t __s2_8 = __p2_8; \
+  float16x8_t __ret_8; \
+  __ret_8 = vfmaq_laneq_f16(__s0_8, -__s1_8, __s2_8, __p3_8); \
+  __ret_8; \
+})
+#else
+#define vfmsq_laneq_f16(__p0_9, __p1_9, __p2_9, __p3_9) __extension__ ({ \
+  float16x8_t __s0_9 = __p0_9; \
+  float16x8_t __s1_9 = __p1_9; \
+  float16x8_t __s2_9 = __p2_9; \
+  float16x8_t __rev0_9;  __rev0_9 = __builtin_shufflevector(__s0_9, __s0_9, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev1_9;  __rev1_9 = __builtin_shufflevector(__s1_9, __s1_9, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev2_9;  __rev2_9 = __builtin_shufflevector(__s2_9, __s2_9, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret_9; \
+  __ret_9 = __noswap_vfmaq_laneq_f16(__rev0_9, -__rev1_9, __rev2_9, __p3_9); \
+  __ret_9 = __builtin_shufflevector(__ret_9, __ret_9, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_9; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_laneq_f16(__p0_10, __p1_10, __p2_10, __p3_10) __extension__ ({ \
+  float16x4_t __s0_10 = __p0_10; \
+  float16x4_t __s1_10 = __p1_10; \
+  float16x8_t __s2_10 = __p2_10; \
+  float16x4_t __ret_10; \
+  __ret_10 = vfma_laneq_f16(__s0_10, -__s1_10, __s2_10, __p3_10); \
+  __ret_10; \
+})
+#else
+#define vfms_laneq_f16(__p0_11, __p1_11, __p2_11, __p3_11) __extension__ ({ \
+  float16x4_t __s0_11 = __p0_11; \
+  float16x4_t __s1_11 = __p1_11; \
+  float16x8_t __s2_11 = __p2_11; \
+  float16x4_t __rev0_11;  __rev0_11 = __builtin_shufflevector(__s0_11, __s0_11, 3, 2, 1, 0); \
+  float16x4_t __rev1_11;  __rev1_11 = __builtin_shufflevector(__s1_11, __s1_11, 3, 2, 1, 0); \
+  float16x8_t __rev2_11;  __rev2_11 = __builtin_shufflevector(__s2_11, __s2_11, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __ret_11; \
+  __ret_11 = __noswap_vfma_laneq_f16(__rev0_11, -__rev1_11, __rev2_11, __p3_11); \
+  __ret_11 = __builtin_shufflevector(__ret_11, __ret_11, 3, 2, 1, 0); \
+  __ret_11; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_n_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16x8_t __ret; \
+  __ret = vfmaq_f16(__s0, -__s1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \
+  __ret; \
+})
+#else
+#define vfmsq_n_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __noswap_vfmaq_f16(__rev0, -__rev1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_n_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16x4_t __ret; \
+  __ret = vfma_f16(__s0, -__s1, (float16x4_t) {__s2, __s2, __s2, __s2}); \
+  __ret; \
+})
+#else
+#define vfms_n_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16_t __s2 = __p2; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __noswap_vfma_f16(__rev0, -__rev1, (float16x4_t) {__s2, __s2, __s2, __s2}); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
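+// vmaxnmv/vminnmv and vmaxv/vminv: across-vector reductions to a scalar.
+// The 'nm' forms follow the IEEE 754-2008 maxNum/minNum rules, preferring
+// a number over a quiet NaN; the plain forms propagate NaNs.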
+#ifdef __LITTLE_ENDIAN__
+#define vmaxnmvq_f16(__p0) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxnmvq_f16((int8x16_t)__s0); \
+  __ret; \
+})
+#else
+#define vmaxnmvq_f16(__p0) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxnmvq_f16((int8x16_t)__rev0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmaxnmv_f16(__p0) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxnmv_f16((int8x8_t)__s0); \
+  __ret; \
+})
+#else
+#define vmaxnmv_f16(__p0) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxnmv_f16((int8x8_t)__rev0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmaxvq_f16(__p0) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxvq_f16((int8x16_t)__s0); \
+  __ret; \
+})
+#else
+#define vmaxvq_f16(__p0) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxvq_f16((int8x16_t)__rev0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmaxv_f16(__p0) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxv_f16((int8x8_t)__s0); \
+  __ret; \
+})
+#else
+#define vmaxv_f16(__p0) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmaxv_f16((int8x8_t)__rev0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vminnmvq_f16(__p0) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminnmvq_f16((int8x16_t)__s0); \
+  __ret; \
+})
+#else
+#define vminnmvq_f16(__p0) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminnmvq_f16((int8x16_t)__rev0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vminnmv_f16(__p0) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminnmv_f16((int8x8_t)__s0); \
+  __ret; \
+})
+#else
+#define vminnmv_f16(__p0) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminnmv_f16((int8x8_t)__rev0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vminvq_f16(__p0) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminvq_f16((int8x16_t)__s0); \
+  __ret; \
+})
+#else
+#define vminvq_f16(__p0) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminvq_f16((int8x16_t)__rev0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vminv_f16(__p0) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminv_f16((int8x8_t)__s0); \
+  __ret; \
+})
+#else
+#define vminv_f16(__p0) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vminv_f16((int8x8_t)__rev0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vmulxq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vmulxq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x8_t __noswap_vmulxq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vmulx_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vmulx_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vmulx_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x4_t __noswap_vmulx_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxh_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmulxh_lane_f16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vmulxh_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmulxh_lane_f16(__s0, (int8x8_t)__rev1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x8_t __ret; \
+  __ret = vmulxq_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __noswap_vmulxq_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __ret; \
+  __ret = vmulx_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulx_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __noswap_vmulx_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxh_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmulxh_laneq_f16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vmulxh_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret; \
+  __ret = (float16_t) __builtin_neon_vmulxh_laneq_f16(__s0, (int8x16_t)__rev1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __ret; \
+  __ret = vmulxq_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __noswap_vmulxq_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x4_t __ret; \
+  __ret = vmulx_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulx_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x8_t __s1 = __p1; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __noswap_vmulx_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_n_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x8_t __ret; \
+  __ret = vmulxq_f16(__s0, (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}); \
+  __ret; \
+})
+#else
+#define vmulxq_n_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __noswap_vmulxq_f16(__rev0, (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_n_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x4_t __ret; \
+  __ret = vmulx_f16(__s0, (float16x4_t) {__s1, __s1, __s1, __s1}); \
+  __ret; \
+})
+#else
+#define vmulx_n_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16_t __s1 = __p1; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __noswap_vmulx_f16(__rev0, (float16x4_t) {__s1, __s1, __s1, __s1}); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vpmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vpmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vpmaxnm_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vpmaxnm_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vpminq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vpminq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vpminnmq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vpminnmq_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vpminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vpminnm_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vpminnm_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vpminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vrndiq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vrndiq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vrndi_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vrndi_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vsqrtq_f16(float16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 40);
+  return __ret;
+}
+#else
+__ai float16x8_t vsqrtq_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = (float16x8_t) __builtin_neon_vsqrtq_v((int8x16_t)__rev0, 40);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vsqrt_f16(float16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vsqrt_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vsqrt_v((int8x8_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vtrn1q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai float16x8_t vtrn1q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vtrn1_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai float16x4_t vtrn1_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vtrn2q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai float16x8_t vtrn2q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vtrn2_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai float16x4_t vtrn2_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vuzp1q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai float16x8_t vuzp1q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vuzp1_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai float16x4_t vuzp1_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vuzp2q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai float16x8_t vuzp2q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vuzp2_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai float16x4_t vuzp2_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vzip1q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai float16x8_t vzip1q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vzip1_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai float16x4_t vzip1_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vzip2q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai float16x8_t vzip2q_f16(float16x8_t __p0, float16x8_t __p1) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vzip2_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai float16x4_t vzip2_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
 #if defined(__ARM_FEATURE_QRDMX)
 #ifdef __LITTLE_ENDIAN__
 __ai int32x4_t vqrdmlahq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
@@ -44220,918 +51096,918 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_p8(__p0_0, __p1_0, __p2_0, __p3_0) __extension__ ({ \
-  poly8x16_t __s0_0 = __p0_0; \
-  poly8x8_t __s2_0 = __p2_0; \
-  poly8x16_t __ret_0; \
-  __ret_0 = vsetq_lane_p8(vget_lane_p8(__s2_0, __p3_0), __s0_0, __p1_0); \
-  __ret_0; \
-})
-#else
-#define vcopyq_lane_p8(__p0_1, __p1_1, __p2_1, __p3_1) __extension__ ({ \
-  poly8x16_t __s0_1 = __p0_1; \
-  poly8x8_t __s2_1 = __p2_1; \
-  poly8x16_t __rev0_1;  __rev0_1 = __builtin_shufflevector(__s0_1, __s0_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x8_t __rev2_1;  __rev2_1 = __builtin_shufflevector(__s2_1, __s2_1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x16_t __ret_1; \
-  __ret_1 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_1, __p3_1), __rev0_1, __p1_1); \
-  __ret_1 = __builtin_shufflevector(__ret_1, __ret_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_1; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_p16(__p0_2, __p1_2, __p2_2, __p3_2) __extension__ ({ \
-  poly16x8_t __s0_2 = __p0_2; \
-  poly16x4_t __s2_2 = __p2_2; \
-  poly16x8_t __ret_2; \
-  __ret_2 = vsetq_lane_p16(vget_lane_p16(__s2_2, __p3_2), __s0_2, __p1_2); \
-  __ret_2; \
-})
-#else
-#define vcopyq_lane_p16(__p0_3, __p1_3, __p2_3, __p3_3) __extension__ ({ \
-  poly16x8_t __s0_3 = __p0_3; \
-  poly16x4_t __s2_3 = __p2_3; \
-  poly16x8_t __rev0_3;  __rev0_3 = __builtin_shufflevector(__s0_3, __s0_3, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly16x4_t __rev2_3;  __rev2_3 = __builtin_shufflevector(__s2_3, __s2_3, 3, 2, 1, 0); \
-  poly16x8_t __ret_3; \
-  __ret_3 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_3, __p3_3), __rev0_3, __p1_3); \
-  __ret_3 = __builtin_shufflevector(__ret_3, __ret_3, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_3; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_u8(__p0_4, __p1_4, __p2_4, __p3_4) __extension__ ({ \
-  uint8x16_t __s0_4 = __p0_4; \
-  uint8x8_t __s2_4 = __p2_4; \
-  uint8x16_t __ret_4; \
-  __ret_4 = vsetq_lane_u8(vget_lane_u8(__s2_4, __p3_4), __s0_4, __p1_4); \
-  __ret_4; \
-})
-#else
-#define vcopyq_lane_u8(__p0_5, __p1_5, __p2_5, __p3_5) __extension__ ({ \
-  uint8x16_t __s0_5 = __p0_5; \
-  uint8x8_t __s2_5 = __p2_5; \
-  uint8x16_t __rev0_5;  __rev0_5 = __builtin_shufflevector(__s0_5, __s0_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev2_5;  __rev2_5 = __builtin_shufflevector(__s2_5, __s2_5, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __ret_5; \
-  __ret_5 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_5, __p3_5), __rev0_5, __p1_5); \
-  __ret_5 = __builtin_shufflevector(__ret_5, __ret_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_5; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_u32(__p0_6, __p1_6, __p2_6, __p3_6) __extension__ ({ \
-  uint32x4_t __s0_6 = __p0_6; \
-  uint32x2_t __s2_6 = __p2_6; \
-  uint32x4_t __ret_6; \
-  __ret_6 = vsetq_lane_u32(vget_lane_u32(__s2_6, __p3_6), __s0_6, __p1_6); \
-  __ret_6; \
-})
-#else
-#define vcopyq_lane_u32(__p0_7, __p1_7, __p2_7, __p3_7) __extension__ ({ \
-  uint32x4_t __s0_7 = __p0_7; \
-  uint32x2_t __s2_7 = __p2_7; \
-  uint32x4_t __rev0_7;  __rev0_7 = __builtin_shufflevector(__s0_7, __s0_7, 3, 2, 1, 0); \
-  uint32x2_t __rev2_7;  __rev2_7 = __builtin_shufflevector(__s2_7, __s2_7, 1, 0); \
-  uint32x4_t __ret_7; \
-  __ret_7 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_7, __p3_7), __rev0_7, __p1_7); \
-  __ret_7 = __builtin_shufflevector(__ret_7, __ret_7, 3, 2, 1, 0); \
-  __ret_7; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_u64(__p0_8, __p1_8, __p2_8, __p3_8) __extension__ ({ \
-  uint64x2_t __s0_8 = __p0_8; \
-  uint64x1_t __s2_8 = __p2_8; \
-  uint64x2_t __ret_8; \
-  __ret_8 = vsetq_lane_u64(vget_lane_u64(__s2_8, __p3_8), __s0_8, __p1_8); \
-  __ret_8; \
-})
-#else
-#define vcopyq_lane_u64(__p0_9, __p1_9, __p2_9, __p3_9) __extension__ ({ \
-  uint64x2_t __s0_9 = __p0_9; \
-  uint64x1_t __s2_9 = __p2_9; \
-  uint64x2_t __rev0_9;  __rev0_9 = __builtin_shufflevector(__s0_9, __s0_9, 1, 0); \
-  uint64x2_t __ret_9; \
-  __ret_9 = __noswap_vsetq_lane_u64(__noswap_vget_lane_u64(__s2_9, __p3_9), __rev0_9, __p1_9); \
-  __ret_9 = __builtin_shufflevector(__ret_9, __ret_9, 1, 0); \
-  __ret_9; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_u16(__p0_10, __p1_10, __p2_10, __p3_10) __extension__ ({ \
-  uint16x8_t __s0_10 = __p0_10; \
-  uint16x4_t __s2_10 = __p2_10; \
-  uint16x8_t __ret_10; \
-  __ret_10 = vsetq_lane_u16(vget_lane_u16(__s2_10, __p3_10), __s0_10, __p1_10); \
-  __ret_10; \
-})
-#else
-#define vcopyq_lane_u16(__p0_11, __p1_11, __p2_11, __p3_11) __extension__ ({ \
-  uint16x8_t __s0_11 = __p0_11; \
-  uint16x4_t __s2_11 = __p2_11; \
-  uint16x8_t __rev0_11;  __rev0_11 = __builtin_shufflevector(__s0_11, __s0_11, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x4_t __rev2_11;  __rev2_11 = __builtin_shufflevector(__s2_11, __s2_11, 3, 2, 1, 0); \
-  uint16x8_t __ret_11; \
-  __ret_11 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_11, __p3_11), __rev0_11, __p1_11); \
-  __ret_11 = __builtin_shufflevector(__ret_11, __ret_11, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_11; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_s8(__p0_12, __p1_12, __p2_12, __p3_12) __extension__ ({ \
-  int8x16_t __s0_12 = __p0_12; \
-  int8x8_t __s2_12 = __p2_12; \
-  int8x16_t __ret_12; \
-  __ret_12 = vsetq_lane_s8(vget_lane_s8(__s2_12, __p3_12), __s0_12, __p1_12); \
+#define vcopyq_lane_p8(__p0_12, __p1_12, __p2_12, __p3_12) __extension__ ({ \
+  poly8x16_t __s0_12 = __p0_12; \
+  poly8x8_t __s2_12 = __p2_12; \
+  poly8x16_t __ret_12; \
+  __ret_12 = vsetq_lane_p8(vget_lane_p8(__s2_12, __p3_12), __s0_12, __p1_12); \
   __ret_12; \
 })
 #else
-#define vcopyq_lane_s8(__p0_13, __p1_13, __p2_13, __p3_13) __extension__ ({ \
-  int8x16_t __s0_13 = __p0_13; \
-  int8x8_t __s2_13 = __p2_13; \
-  int8x16_t __rev0_13;  __rev0_13 = __builtin_shufflevector(__s0_13, __s0_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev2_13;  __rev2_13 = __builtin_shufflevector(__s2_13, __s2_13, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __ret_13; \
-  __ret_13 = __noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_13, __p3_13), __rev0_13, __p1_13); \
+#define vcopyq_lane_p8(__p0_13, __p1_13, __p2_13, __p3_13) __extension__ ({ \
+  poly8x16_t __s0_13 = __p0_13; \
+  poly8x8_t __s2_13 = __p2_13; \
+  poly8x16_t __rev0_13;  __rev0_13 = __builtin_shufflevector(__s0_13, __s0_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev2_13;  __rev2_13 = __builtin_shufflevector(__s2_13, __s2_13, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret_13; \
+  __ret_13 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_13, __p3_13), __rev0_13, __p1_13); \
   __ret_13 = __builtin_shufflevector(__ret_13, __ret_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_13; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_f32(__p0_14, __p1_14, __p2_14, __p3_14) __extension__ ({ \
-  float32x4_t __s0_14 = __p0_14; \
-  float32x2_t __s2_14 = __p2_14; \
-  float32x4_t __ret_14; \
-  __ret_14 = vsetq_lane_f32(vget_lane_f32(__s2_14, __p3_14), __s0_14, __p1_14); \
+#define vcopyq_lane_p16(__p0_14, __p1_14, __p2_14, __p3_14) __extension__ ({ \
+  poly16x8_t __s0_14 = __p0_14; \
+  poly16x4_t __s2_14 = __p2_14; \
+  poly16x8_t __ret_14; \
+  __ret_14 = vsetq_lane_p16(vget_lane_p16(__s2_14, __p3_14), __s0_14, __p1_14); \
   __ret_14; \
 })
 #else
-#define vcopyq_lane_f32(__p0_15, __p1_15, __p2_15, __p3_15) __extension__ ({ \
-  float32x4_t __s0_15 = __p0_15; \
-  float32x2_t __s2_15 = __p2_15; \
-  float32x4_t __rev0_15;  __rev0_15 = __builtin_shufflevector(__s0_15, __s0_15, 3, 2, 1, 0); \
-  float32x2_t __rev2_15;  __rev2_15 = __builtin_shufflevector(__s2_15, __s2_15, 1, 0); \
-  float32x4_t __ret_15; \
-  __ret_15 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_15, __p3_15), __rev0_15, __p1_15); \
-  __ret_15 = __builtin_shufflevector(__ret_15, __ret_15, 3, 2, 1, 0); \
+#define vcopyq_lane_p16(__p0_15, __p1_15, __p2_15, __p3_15) __extension__ ({ \
+  poly16x8_t __s0_15 = __p0_15; \
+  poly16x4_t __s2_15 = __p2_15; \
+  poly16x8_t __rev0_15;  __rev0_15 = __builtin_shufflevector(__s0_15, __s0_15, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x4_t __rev2_15;  __rev2_15 = __builtin_shufflevector(__s2_15, __s2_15, 3, 2, 1, 0); \
+  poly16x8_t __ret_15; \
+  __ret_15 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_15, __p3_15), __rev0_15, __p1_15); \
+  __ret_15 = __builtin_shufflevector(__ret_15, __ret_15, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_15; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_s32(__p0_16, __p1_16, __p2_16, __p3_16) __extension__ ({ \
-  int32x4_t __s0_16 = __p0_16; \
-  int32x2_t __s2_16 = __p2_16; \
-  int32x4_t __ret_16; \
-  __ret_16 = vsetq_lane_s32(vget_lane_s32(__s2_16, __p3_16), __s0_16, __p1_16); \
+#define vcopyq_lane_u8(__p0_16, __p1_16, __p2_16, __p3_16) __extension__ ({ \
+  uint8x16_t __s0_16 = __p0_16; \
+  uint8x8_t __s2_16 = __p2_16; \
+  uint8x16_t __ret_16; \
+  __ret_16 = vsetq_lane_u8(vget_lane_u8(__s2_16, __p3_16), __s0_16, __p1_16); \
   __ret_16; \
 })
 #else
-#define vcopyq_lane_s32(__p0_17, __p1_17, __p2_17, __p3_17) __extension__ ({ \
-  int32x4_t __s0_17 = __p0_17; \
-  int32x2_t __s2_17 = __p2_17; \
-  int32x4_t __rev0_17;  __rev0_17 = __builtin_shufflevector(__s0_17, __s0_17, 3, 2, 1, 0); \
-  int32x2_t __rev2_17;  __rev2_17 = __builtin_shufflevector(__s2_17, __s2_17, 1, 0); \
-  int32x4_t __ret_17; \
-  __ret_17 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_17, __p3_17), __rev0_17, __p1_17); \
-  __ret_17 = __builtin_shufflevector(__ret_17, __ret_17, 3, 2, 1, 0); \
+#define vcopyq_lane_u8(__p0_17, __p1_17, __p2_17, __p3_17) __extension__ ({ \
+  uint8x16_t __s0_17 = __p0_17; \
+  uint8x8_t __s2_17 = __p2_17; \
+  uint8x16_t __rev0_17;  __rev0_17 = __builtin_shufflevector(__s0_17, __s0_17, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev2_17;  __rev2_17 = __builtin_shufflevector(__s2_17, __s2_17, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_17; \
+  __ret_17 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_17, __p3_17), __rev0_17, __p1_17); \
+  __ret_17 = __builtin_shufflevector(__ret_17, __ret_17, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_17; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_s64(__p0_18, __p1_18, __p2_18, __p3_18) __extension__ ({ \
-  int64x2_t __s0_18 = __p0_18; \
-  int64x1_t __s2_18 = __p2_18; \
-  int64x2_t __ret_18; \
-  __ret_18 = vsetq_lane_s64(vget_lane_s64(__s2_18, __p3_18), __s0_18, __p1_18); \
+#define vcopyq_lane_u32(__p0_18, __p1_18, __p2_18, __p3_18) __extension__ ({ \
+  uint32x4_t __s0_18 = __p0_18; \
+  uint32x2_t __s2_18 = __p2_18; \
+  uint32x4_t __ret_18; \
+  __ret_18 = vsetq_lane_u32(vget_lane_u32(__s2_18, __p3_18), __s0_18, __p1_18); \
   __ret_18; \
 })
 #else
-#define vcopyq_lane_s64(__p0_19, __p1_19, __p2_19, __p3_19) __extension__ ({ \
-  int64x2_t __s0_19 = __p0_19; \
-  int64x1_t __s2_19 = __p2_19; \
-  int64x2_t __rev0_19;  __rev0_19 = __builtin_shufflevector(__s0_19, __s0_19, 1, 0); \
-  int64x2_t __ret_19; \
-  __ret_19 = __noswap_vsetq_lane_s64(__noswap_vget_lane_s64(__s2_19, __p3_19), __rev0_19, __p1_19); \
-  __ret_19 = __builtin_shufflevector(__ret_19, __ret_19, 1, 0); \
+#define vcopyq_lane_u32(__p0_19, __p1_19, __p2_19, __p3_19) __extension__ ({ \
+  uint32x4_t __s0_19 = __p0_19; \
+  uint32x2_t __s2_19 = __p2_19; \
+  uint32x4_t __rev0_19;  __rev0_19 = __builtin_shufflevector(__s0_19, __s0_19, 3, 2, 1, 0); \
+  uint32x2_t __rev2_19;  __rev2_19 = __builtin_shufflevector(__s2_19, __s2_19, 1, 0); \
+  uint32x4_t __ret_19; \
+  __ret_19 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_19, __p3_19), __rev0_19, __p1_19); \
+  __ret_19 = __builtin_shufflevector(__ret_19, __ret_19, 3, 2, 1, 0); \
   __ret_19; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_s16(__p0_20, __p1_20, __p2_20, __p3_20) __extension__ ({ \
-  int16x8_t __s0_20 = __p0_20; \
-  int16x4_t __s2_20 = __p2_20; \
-  int16x8_t __ret_20; \
-  __ret_20 = vsetq_lane_s16(vget_lane_s16(__s2_20, __p3_20), __s0_20, __p1_20); \
+#define vcopyq_lane_u64(__p0_20, __p1_20, __p2_20, __p3_20) __extension__ ({ \
+  uint64x2_t __s0_20 = __p0_20; \
+  uint64x1_t __s2_20 = __p2_20; \
+  uint64x2_t __ret_20; \
+  __ret_20 = vsetq_lane_u64(vget_lane_u64(__s2_20, __p3_20), __s0_20, __p1_20); \
   __ret_20; \
 })
 #else
-#define vcopyq_lane_s16(__p0_21, __p1_21, __p2_21, __p3_21) __extension__ ({ \
-  int16x8_t __s0_21 = __p0_21; \
-  int16x4_t __s2_21 = __p2_21; \
-  int16x8_t __rev0_21;  __rev0_21 = __builtin_shufflevector(__s0_21, __s0_21, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_21;  __rev2_21 = __builtin_shufflevector(__s2_21, __s2_21, 3, 2, 1, 0); \
-  int16x8_t __ret_21; \
-  __ret_21 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_21, __p3_21), __rev0_21, __p1_21); \
-  __ret_21 = __builtin_shufflevector(__ret_21, __ret_21, 7, 6, 5, 4, 3, 2, 1, 0); \
+#define vcopyq_lane_u64(__p0_21, __p1_21, __p2_21, __p3_21) __extension__ ({ \
+  uint64x2_t __s0_21 = __p0_21; \
+  uint64x1_t __s2_21 = __p2_21; \
+  uint64x2_t __rev0_21;  __rev0_21 = __builtin_shufflevector(__s0_21, __s0_21, 1, 0); \
+  uint64x2_t __ret_21; \
+  __ret_21 = __noswap_vsetq_lane_u64(__noswap_vget_lane_u64(__s2_21, __p3_21), __rev0_21, __p1_21); \
+  __ret_21 = __builtin_shufflevector(__ret_21, __ret_21, 1, 0); \
   __ret_21; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_p8(__p0_22, __p1_22, __p2_22, __p3_22) __extension__ ({ \
-  poly8x8_t __s0_22 = __p0_22; \
-  poly8x8_t __s2_22 = __p2_22; \
-  poly8x8_t __ret_22; \
-  __ret_22 = vset_lane_p8(vget_lane_p8(__s2_22, __p3_22), __s0_22, __p1_22); \
+#define vcopyq_lane_u16(__p0_22, __p1_22, __p2_22, __p3_22) __extension__ ({ \
+  uint16x8_t __s0_22 = __p0_22; \
+  uint16x4_t __s2_22 = __p2_22; \
+  uint16x8_t __ret_22; \
+  __ret_22 = vsetq_lane_u16(vget_lane_u16(__s2_22, __p3_22), __s0_22, __p1_22); \
   __ret_22; \
 })
 #else
-#define vcopy_lane_p8(__p0_23, __p1_23, __p2_23, __p3_23) __extension__ ({ \
-  poly8x8_t __s0_23 = __p0_23; \
-  poly8x8_t __s2_23 = __p2_23; \
-  poly8x8_t __rev0_23;  __rev0_23 = __builtin_shufflevector(__s0_23, __s0_23, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x8_t __rev2_23;  __rev2_23 = __builtin_shufflevector(__s2_23, __s2_23, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x8_t __ret_23; \
-  __ret_23 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_23, __p3_23), __rev0_23, __p1_23); \
+#define vcopyq_lane_u16(__p0_23, __p1_23, __p2_23, __p3_23) __extension__ ({ \
+  uint16x8_t __s0_23 = __p0_23; \
+  uint16x4_t __s2_23 = __p2_23; \
+  uint16x8_t __rev0_23;  __rev0_23 = __builtin_shufflevector(__s0_23, __s0_23, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2_23;  __rev2_23 = __builtin_shufflevector(__s2_23, __s2_23, 3, 2, 1, 0); \
+  uint16x8_t __ret_23; \
+  __ret_23 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_23, __p3_23), __rev0_23, __p1_23); \
   __ret_23 = __builtin_shufflevector(__ret_23, __ret_23, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_23; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_p16(__p0_24, __p1_24, __p2_24, __p3_24) __extension__ ({ \
-  poly16x4_t __s0_24 = __p0_24; \
-  poly16x4_t __s2_24 = __p2_24; \
-  poly16x4_t __ret_24; \
-  __ret_24 = vset_lane_p16(vget_lane_p16(__s2_24, __p3_24), __s0_24, __p1_24); \
+#define vcopyq_lane_s8(__p0_24, __p1_24, __p2_24, __p3_24) __extension__ ({ \
+  int8x16_t __s0_24 = __p0_24; \
+  int8x8_t __s2_24 = __p2_24; \
+  int8x16_t __ret_24; \
+  __ret_24 = vsetq_lane_s8(vget_lane_s8(__s2_24, __p3_24), __s0_24, __p1_24); \
   __ret_24; \
 })
 #else
-#define vcopy_lane_p16(__p0_25, __p1_25, __p2_25, __p3_25) __extension__ ({ \
-  poly16x4_t __s0_25 = __p0_25; \
-  poly16x4_t __s2_25 = __p2_25; \
-  poly16x4_t __rev0_25;  __rev0_25 = __builtin_shufflevector(__s0_25, __s0_25, 3, 2, 1, 0); \
-  poly16x4_t __rev2_25;  __rev2_25 = __builtin_shufflevector(__s2_25, __s2_25, 3, 2, 1, 0); \
-  poly16x4_t __ret_25; \
-  __ret_25 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_25, __p3_25), __rev0_25, __p1_25); \
-  __ret_25 = __builtin_shufflevector(__ret_25, __ret_25, 3, 2, 1, 0); \
+#define vcopyq_lane_s8(__p0_25, __p1_25, __p2_25, __p3_25) __extension__ ({ \
+  int8x16_t __s0_25 = __p0_25; \
+  int8x8_t __s2_25 = __p2_25; \
+  int8x16_t __rev0_25;  __rev0_25 = __builtin_shufflevector(__s0_25, __s0_25, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev2_25;  __rev2_25 = __builtin_shufflevector(__s2_25, __s2_25, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_25; \
+  __ret_25 = __noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_25, __p3_25), __rev0_25, __p1_25); \
+  __ret_25 = __builtin_shufflevector(__ret_25, __ret_25, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_25; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_u8(__p0_26, __p1_26, __p2_26, __p3_26) __extension__ ({ \
-  uint8x8_t __s0_26 = __p0_26; \
-  uint8x8_t __s2_26 = __p2_26; \
-  uint8x8_t __ret_26; \
-  __ret_26 = vset_lane_u8(vget_lane_u8(__s2_26, __p3_26), __s0_26, __p1_26); \
+#define vcopyq_lane_f32(__p0_26, __p1_26, __p2_26, __p3_26) __extension__ ({ \
+  float32x4_t __s0_26 = __p0_26; \
+  float32x2_t __s2_26 = __p2_26; \
+  float32x4_t __ret_26; \
+  __ret_26 = vsetq_lane_f32(vget_lane_f32(__s2_26, __p3_26), __s0_26, __p1_26); \
   __ret_26; \
 })
 #else
-#define vcopy_lane_u8(__p0_27, __p1_27, __p2_27, __p3_27) __extension__ ({ \
-  uint8x8_t __s0_27 = __p0_27; \
-  uint8x8_t __s2_27 = __p2_27; \
-  uint8x8_t __rev0_27;  __rev0_27 = __builtin_shufflevector(__s0_27, __s0_27, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev2_27;  __rev2_27 = __builtin_shufflevector(__s2_27, __s2_27, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __ret_27; \
-  __ret_27 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_27, __p3_27), __rev0_27, __p1_27); \
-  __ret_27 = __builtin_shufflevector(__ret_27, __ret_27, 7, 6, 5, 4, 3, 2, 1, 0); \
+#define vcopyq_lane_f32(__p0_27, __p1_27, __p2_27, __p3_27) __extension__ ({ \
+  float32x4_t __s0_27 = __p0_27; \
+  float32x2_t __s2_27 = __p2_27; \
+  float32x4_t __rev0_27;  __rev0_27 = __builtin_shufflevector(__s0_27, __s0_27, 3, 2, 1, 0); \
+  float32x2_t __rev2_27;  __rev2_27 = __builtin_shufflevector(__s2_27, __s2_27, 1, 0); \
+  float32x4_t __ret_27; \
+  __ret_27 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_27, __p3_27), __rev0_27, __p1_27); \
+  __ret_27 = __builtin_shufflevector(__ret_27, __ret_27, 3, 2, 1, 0); \
   __ret_27; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_u32(__p0_28, __p1_28, __p2_28, __p3_28) __extension__ ({ \
-  uint32x2_t __s0_28 = __p0_28; \
-  uint32x2_t __s2_28 = __p2_28; \
-  uint32x2_t __ret_28; \
-  __ret_28 = vset_lane_u32(vget_lane_u32(__s2_28, __p3_28), __s0_28, __p1_28); \
+#define vcopyq_lane_s32(__p0_28, __p1_28, __p2_28, __p3_28) __extension__ ({ \
+  int32x4_t __s0_28 = __p0_28; \
+  int32x2_t __s2_28 = __p2_28; \
+  int32x4_t __ret_28; \
+  __ret_28 = vsetq_lane_s32(vget_lane_s32(__s2_28, __p3_28), __s0_28, __p1_28); \
   __ret_28; \
 })
 #else
-#define vcopy_lane_u32(__p0_29, __p1_29, __p2_29, __p3_29) __extension__ ({ \
-  uint32x2_t __s0_29 = __p0_29; \
-  uint32x2_t __s2_29 = __p2_29; \
-  uint32x2_t __rev0_29;  __rev0_29 = __builtin_shufflevector(__s0_29, __s0_29, 1, 0); \
-  uint32x2_t __rev2_29;  __rev2_29 = __builtin_shufflevector(__s2_29, __s2_29, 1, 0); \
-  uint32x2_t __ret_29; \
-  __ret_29 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_29, __p3_29), __rev0_29, __p1_29); \
-  __ret_29 = __builtin_shufflevector(__ret_29, __ret_29, 1, 0); \
+#define vcopyq_lane_s32(__p0_29, __p1_29, __p2_29, __p3_29) __extension__ ({ \
+  int32x4_t __s0_29 = __p0_29; \
+  int32x2_t __s2_29 = __p2_29; \
+  int32x4_t __rev0_29;  __rev0_29 = __builtin_shufflevector(__s0_29, __s0_29, 3, 2, 1, 0); \
+  int32x2_t __rev2_29;  __rev2_29 = __builtin_shufflevector(__s2_29, __s2_29, 1, 0); \
+  int32x4_t __ret_29; \
+  __ret_29 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_29, __p3_29), __rev0_29, __p1_29); \
+  __ret_29 = __builtin_shufflevector(__ret_29, __ret_29, 3, 2, 1, 0); \
   __ret_29; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_u64(__p0_30, __p1_30, __p2_30, __p3_30) __extension__ ({ \
-  uint64x1_t __s0_30 = __p0_30; \
-  uint64x1_t __s2_30 = __p2_30; \
-  uint64x1_t __ret_30; \
-  __ret_30 = vset_lane_u64(vget_lane_u64(__s2_30, __p3_30), __s0_30, __p1_30); \
+#define vcopyq_lane_s64(__p0_30, __p1_30, __p2_30, __p3_30) __extension__ ({ \
+  int64x2_t __s0_30 = __p0_30; \
+  int64x1_t __s2_30 = __p2_30; \
+  int64x2_t __ret_30; \
+  __ret_30 = vsetq_lane_s64(vget_lane_s64(__s2_30, __p3_30), __s0_30, __p1_30); \
   __ret_30; \
 })
 #else
-#define vcopy_lane_u64(__p0_31, __p1_31, __p2_31, __p3_31) __extension__ ({ \
-  uint64x1_t __s0_31 = __p0_31; \
-  uint64x1_t __s2_31 = __p2_31; \
-  uint64x1_t __ret_31; \
-  __ret_31 = __noswap_vset_lane_u64(__noswap_vget_lane_u64(__s2_31, __p3_31), __s0_31, __p1_31); \
+#define vcopyq_lane_s64(__p0_31, __p1_31, __p2_31, __p3_31) __extension__ ({ \
+  int64x2_t __s0_31 = __p0_31; \
+  int64x1_t __s2_31 = __p2_31; \
+  int64x2_t __rev0_31;  __rev0_31 = __builtin_shufflevector(__s0_31, __s0_31, 1, 0); \
+  int64x2_t __ret_31; \
+  __ret_31 = __noswap_vsetq_lane_s64(__noswap_vget_lane_s64(__s2_31, __p3_31), __rev0_31, __p1_31); \
+  __ret_31 = __builtin_shufflevector(__ret_31, __ret_31, 1, 0); \
   __ret_31; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_u16(__p0_32, __p1_32, __p2_32, __p3_32) __extension__ ({ \
-  uint16x4_t __s0_32 = __p0_32; \
-  uint16x4_t __s2_32 = __p2_32; \
-  uint16x4_t __ret_32; \
-  __ret_32 = vset_lane_u16(vget_lane_u16(__s2_32, __p3_32), __s0_32, __p1_32); \
+#define vcopyq_lane_s16(__p0_32, __p1_32, __p2_32, __p3_32) __extension__ ({ \
+  int16x8_t __s0_32 = __p0_32; \
+  int16x4_t __s2_32 = __p2_32; \
+  int16x8_t __ret_32; \
+  __ret_32 = vsetq_lane_s16(vget_lane_s16(__s2_32, __p3_32), __s0_32, __p1_32); \
   __ret_32; \
 })
 #else
-#define vcopy_lane_u16(__p0_33, __p1_33, __p2_33, __p3_33) __extension__ ({ \
-  uint16x4_t __s0_33 = __p0_33; \
-  uint16x4_t __s2_33 = __p2_33; \
-  uint16x4_t __rev0_33;  __rev0_33 = __builtin_shufflevector(__s0_33, __s0_33, 3, 2, 1, 0); \
-  uint16x4_t __rev2_33;  __rev2_33 = __builtin_shufflevector(__s2_33, __s2_33, 3, 2, 1, 0); \
-  uint16x4_t __ret_33; \
-  __ret_33 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_33, __p3_33), __rev0_33, __p1_33); \
-  __ret_33 = __builtin_shufflevector(__ret_33, __ret_33, 3, 2, 1, 0); \
+#define vcopyq_lane_s16(__p0_33, __p1_33, __p2_33, __p3_33) __extension__ ({ \
+  int16x8_t __s0_33 = __p0_33; \
+  int16x4_t __s2_33 = __p2_33; \
+  int16x8_t __rev0_33;  __rev0_33 = __builtin_shufflevector(__s0_33, __s0_33, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2_33;  __rev2_33 = __builtin_shufflevector(__s2_33, __s2_33, 3, 2, 1, 0); \
+  int16x8_t __ret_33; \
+  __ret_33 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_33, __p3_33), __rev0_33, __p1_33); \
+  __ret_33 = __builtin_shufflevector(__ret_33, __ret_33, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_33; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_s8(__p0_34, __p1_34, __p2_34, __p3_34) __extension__ ({ \
-  int8x8_t __s0_34 = __p0_34; \
-  int8x8_t __s2_34 = __p2_34; \
-  int8x8_t __ret_34; \
-  __ret_34 = vset_lane_s8(vget_lane_s8(__s2_34, __p3_34), __s0_34, __p1_34); \
+#define vcopy_lane_p8(__p0_34, __p1_34, __p2_34, __p3_34) __extension__ ({ \
+  poly8x8_t __s0_34 = __p0_34; \
+  poly8x8_t __s2_34 = __p2_34; \
+  poly8x8_t __ret_34; \
+  __ret_34 = vset_lane_p8(vget_lane_p8(__s2_34, __p3_34), __s0_34, __p1_34); \
   __ret_34; \
 })
 #else
-#define vcopy_lane_s8(__p0_35, __p1_35, __p2_35, __p3_35) __extension__ ({ \
-  int8x8_t __s0_35 = __p0_35; \
-  int8x8_t __s2_35 = __p2_35; \
-  int8x8_t __rev0_35;  __rev0_35 = __builtin_shufflevector(__s0_35, __s0_35, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev2_35;  __rev2_35 = __builtin_shufflevector(__s2_35, __s2_35, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __ret_35; \
-  __ret_35 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_35, __p3_35), __rev0_35, __p1_35); \
+#define vcopy_lane_p8(__p0_35, __p1_35, __p2_35, __p3_35) __extension__ ({ \
+  poly8x8_t __s0_35 = __p0_35; \
+  poly8x8_t __s2_35 = __p2_35; \
+  poly8x8_t __rev0_35;  __rev0_35 = __builtin_shufflevector(__s0_35, __s0_35, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev2_35;  __rev2_35 = __builtin_shufflevector(__s2_35, __s2_35, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret_35; \
+  __ret_35 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_35, __p3_35), __rev0_35, __p1_35); \
   __ret_35 = __builtin_shufflevector(__ret_35, __ret_35, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_35; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_f32(__p0_36, __p1_36, __p2_36, __p3_36) __extension__ ({ \
-  float32x2_t __s0_36 = __p0_36; \
-  float32x2_t __s2_36 = __p2_36; \
-  float32x2_t __ret_36; \
-  __ret_36 = vset_lane_f32(vget_lane_f32(__s2_36, __p3_36), __s0_36, __p1_36); \
+#define vcopy_lane_p16(__p0_36, __p1_36, __p2_36, __p3_36) __extension__ ({ \
+  poly16x4_t __s0_36 = __p0_36; \
+  poly16x4_t __s2_36 = __p2_36; \
+  poly16x4_t __ret_36; \
+  __ret_36 = vset_lane_p16(vget_lane_p16(__s2_36, __p3_36), __s0_36, __p1_36); \
   __ret_36; \
 })
 #else
-#define vcopy_lane_f32(__p0_37, __p1_37, __p2_37, __p3_37) __extension__ ({ \
-  float32x2_t __s0_37 = __p0_37; \
-  float32x2_t __s2_37 = __p2_37; \
-  float32x2_t __rev0_37;  __rev0_37 = __builtin_shufflevector(__s0_37, __s0_37, 1, 0); \
-  float32x2_t __rev2_37;  __rev2_37 = __builtin_shufflevector(__s2_37, __s2_37, 1, 0); \
-  float32x2_t __ret_37; \
-  __ret_37 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_37, __p3_37), __rev0_37, __p1_37); \
-  __ret_37 = __builtin_shufflevector(__ret_37, __ret_37, 1, 0); \
+#define vcopy_lane_p16(__p0_37, __p1_37, __p2_37, __p3_37) __extension__ ({ \
+  poly16x4_t __s0_37 = __p0_37; \
+  poly16x4_t __s2_37 = __p2_37; \
+  poly16x4_t __rev0_37;  __rev0_37 = __builtin_shufflevector(__s0_37, __s0_37, 3, 2, 1, 0); \
+  poly16x4_t __rev2_37;  __rev2_37 = __builtin_shufflevector(__s2_37, __s2_37, 3, 2, 1, 0); \
+  poly16x4_t __ret_37; \
+  __ret_37 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_37, __p3_37), __rev0_37, __p1_37); \
+  __ret_37 = __builtin_shufflevector(__ret_37, __ret_37, 3, 2, 1, 0); \
   __ret_37; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_s32(__p0_38, __p1_38, __p2_38, __p3_38) __extension__ ({ \
-  int32x2_t __s0_38 = __p0_38; \
-  int32x2_t __s2_38 = __p2_38; \
-  int32x2_t __ret_38; \
-  __ret_38 = vset_lane_s32(vget_lane_s32(__s2_38, __p3_38), __s0_38, __p1_38); \
+#define vcopy_lane_u8(__p0_38, __p1_38, __p2_38, __p3_38) __extension__ ({ \
+  uint8x8_t __s0_38 = __p0_38; \
+  uint8x8_t __s2_38 = __p2_38; \
+  uint8x8_t __ret_38; \
+  __ret_38 = vset_lane_u8(vget_lane_u8(__s2_38, __p3_38), __s0_38, __p1_38); \
   __ret_38; \
 })
 #else
-#define vcopy_lane_s32(__p0_39, __p1_39, __p2_39, __p3_39) __extension__ ({ \
-  int32x2_t __s0_39 = __p0_39; \
-  int32x2_t __s2_39 = __p2_39; \
-  int32x2_t __rev0_39;  __rev0_39 = __builtin_shufflevector(__s0_39, __s0_39, 1, 0); \
-  int32x2_t __rev2_39;  __rev2_39 = __builtin_shufflevector(__s2_39, __s2_39, 1, 0); \
-  int32x2_t __ret_39; \
-  __ret_39 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_39, __p3_39), __rev0_39, __p1_39); \
-  __ret_39 = __builtin_shufflevector(__ret_39, __ret_39, 1, 0); \
+#define vcopy_lane_u8(__p0_39, __p1_39, __p2_39, __p3_39) __extension__ ({ \
+  uint8x8_t __s0_39 = __p0_39; \
+  uint8x8_t __s2_39 = __p2_39; \
+  uint8x8_t __rev0_39;  __rev0_39 = __builtin_shufflevector(__s0_39, __s0_39, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev2_39;  __rev2_39 = __builtin_shufflevector(__s2_39, __s2_39, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret_39; \
+  __ret_39 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_39, __p3_39), __rev0_39, __p1_39); \
+  __ret_39 = __builtin_shufflevector(__ret_39, __ret_39, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_39; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_s64(__p0_40, __p1_40, __p2_40, __p3_40) __extension__ ({ \
-  int64x1_t __s0_40 = __p0_40; \
-  int64x1_t __s2_40 = __p2_40; \
-  int64x1_t __ret_40; \
-  __ret_40 = vset_lane_s64(vget_lane_s64(__s2_40, __p3_40), __s0_40, __p1_40); \
+#define vcopy_lane_u32(__p0_40, __p1_40, __p2_40, __p3_40) __extension__ ({ \
+  uint32x2_t __s0_40 = __p0_40; \
+  uint32x2_t __s2_40 = __p2_40; \
+  uint32x2_t __ret_40; \
+  __ret_40 = vset_lane_u32(vget_lane_u32(__s2_40, __p3_40), __s0_40, __p1_40); \
   __ret_40; \
 })
 #else
-#define vcopy_lane_s64(__p0_41, __p1_41, __p2_41, __p3_41) __extension__ ({ \
-  int64x1_t __s0_41 = __p0_41; \
-  int64x1_t __s2_41 = __p2_41; \
-  int64x1_t __ret_41; \
-  __ret_41 = __noswap_vset_lane_s64(__noswap_vget_lane_s64(__s2_41, __p3_41), __s0_41, __p1_41); \
+#define vcopy_lane_u32(__p0_41, __p1_41, __p2_41, __p3_41) __extension__ ({ \
+  uint32x2_t __s0_41 = __p0_41; \
+  uint32x2_t __s2_41 = __p2_41; \
+  uint32x2_t __rev0_41;  __rev0_41 = __builtin_shufflevector(__s0_41, __s0_41, 1, 0); \
+  uint32x2_t __rev2_41;  __rev2_41 = __builtin_shufflevector(__s2_41, __s2_41, 1, 0); \
+  uint32x2_t __ret_41; \
+  __ret_41 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_41, __p3_41), __rev0_41, __p1_41); \
+  __ret_41 = __builtin_shufflevector(__ret_41, __ret_41, 1, 0); \
   __ret_41; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_s16(__p0_42, __p1_42, __p2_42, __p3_42) __extension__ ({ \
-  int16x4_t __s0_42 = __p0_42; \
-  int16x4_t __s2_42 = __p2_42; \
-  int16x4_t __ret_42; \
-  __ret_42 = vset_lane_s16(vget_lane_s16(__s2_42, __p3_42), __s0_42, __p1_42); \
+#define vcopy_lane_u64(__p0_42, __p1_42, __p2_42, __p3_42) __extension__ ({ \
+  uint64x1_t __s0_42 = __p0_42; \
+  uint64x1_t __s2_42 = __p2_42; \
+  uint64x1_t __ret_42; \
+  __ret_42 = vset_lane_u64(vget_lane_u64(__s2_42, __p3_42), __s0_42, __p1_42); \
   __ret_42; \
 })
 #else
-#define vcopy_lane_s16(__p0_43, __p1_43, __p2_43, __p3_43) __extension__ ({ \
-  int16x4_t __s0_43 = __p0_43; \
-  int16x4_t __s2_43 = __p2_43; \
-  int16x4_t __rev0_43;  __rev0_43 = __builtin_shufflevector(__s0_43, __s0_43, 3, 2, 1, 0); \
-  int16x4_t __rev2_43;  __rev2_43 = __builtin_shufflevector(__s2_43, __s2_43, 3, 2, 1, 0); \
-  int16x4_t __ret_43; \
-  __ret_43 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_43, __p3_43), __rev0_43, __p1_43); \
-  __ret_43 = __builtin_shufflevector(__ret_43, __ret_43, 3, 2, 1, 0); \
+#define vcopy_lane_u64(__p0_43, __p1_43, __p2_43, __p3_43) __extension__ ({ \
+  uint64x1_t __s0_43 = __p0_43; \
+  uint64x1_t __s2_43 = __p2_43; \
+  uint64x1_t __ret_43; \
+  __ret_43 = __noswap_vset_lane_u64(__noswap_vget_lane_u64(__s2_43, __p3_43), __s0_43, __p1_43); \
   __ret_43; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_p8(__p0_44, __p1_44, __p2_44, __p3_44) __extension__ ({ \
-  poly8x16_t __s0_44 = __p0_44; \
-  poly8x16_t __s2_44 = __p2_44; \
-  poly8x16_t __ret_44; \
-  __ret_44 = vsetq_lane_p8(vgetq_lane_p8(__s2_44, __p3_44), __s0_44, __p1_44); \
+#define vcopy_lane_u16(__p0_44, __p1_44, __p2_44, __p3_44) __extension__ ({ \
+  uint16x4_t __s0_44 = __p0_44; \
+  uint16x4_t __s2_44 = __p2_44; \
+  uint16x4_t __ret_44; \
+  __ret_44 = vset_lane_u16(vget_lane_u16(__s2_44, __p3_44), __s0_44, __p1_44); \
   __ret_44; \
 })
 #else
-#define vcopyq_laneq_p8(__p0_45, __p1_45, __p2_45, __p3_45) __extension__ ({ \
-  poly8x16_t __s0_45 = __p0_45; \
-  poly8x16_t __s2_45 = __p2_45; \
-  poly8x16_t __rev0_45;  __rev0_45 = __builtin_shufflevector(__s0_45, __s0_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x16_t __rev2_45;  __rev2_45 = __builtin_shufflevector(__s2_45, __s2_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x16_t __ret_45; \
-  __ret_45 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_45, __p3_45), __rev0_45, __p1_45); \
-  __ret_45 = __builtin_shufflevector(__ret_45, __ret_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+#define vcopy_lane_u16(__p0_45, __p1_45, __p2_45, __p3_45) __extension__ ({ \
+  uint16x4_t __s0_45 = __p0_45; \
+  uint16x4_t __s2_45 = __p2_45; \
+  uint16x4_t __rev0_45;  __rev0_45 = __builtin_shufflevector(__s0_45, __s0_45, 3, 2, 1, 0); \
+  uint16x4_t __rev2_45;  __rev2_45 = __builtin_shufflevector(__s2_45, __s2_45, 3, 2, 1, 0); \
+  uint16x4_t __ret_45; \
+  __ret_45 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_45, __p3_45), __rev0_45, __p1_45); \
+  __ret_45 = __builtin_shufflevector(__ret_45, __ret_45, 3, 2, 1, 0); \
   __ret_45; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_p16(__p0_46, __p1_46, __p2_46, __p3_46) __extension__ ({ \
-  poly16x8_t __s0_46 = __p0_46; \
-  poly16x8_t __s2_46 = __p2_46; \
-  poly16x8_t __ret_46; \
-  __ret_46 = vsetq_lane_p16(vgetq_lane_p16(__s2_46, __p3_46), __s0_46, __p1_46); \
+#define vcopy_lane_s8(__p0_46, __p1_46, __p2_46, __p3_46) __extension__ ({ \
+  int8x8_t __s0_46 = __p0_46; \
+  int8x8_t __s2_46 = __p2_46; \
+  int8x8_t __ret_46; \
+  __ret_46 = vset_lane_s8(vget_lane_s8(__s2_46, __p3_46), __s0_46, __p1_46); \
   __ret_46; \
 })
 #else
-#define vcopyq_laneq_p16(__p0_47, __p1_47, __p2_47, __p3_47) __extension__ ({ \
-  poly16x8_t __s0_47 = __p0_47; \
-  poly16x8_t __s2_47 = __p2_47; \
-  poly16x8_t __rev0_47;  __rev0_47 = __builtin_shufflevector(__s0_47, __s0_47, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly16x8_t __rev2_47;  __rev2_47 = __builtin_shufflevector(__s2_47, __s2_47, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly16x8_t __ret_47; \
-  __ret_47 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_47, __p3_47), __rev0_47, __p1_47); \
+#define vcopy_lane_s8(__p0_47, __p1_47, __p2_47, __p3_47) __extension__ ({ \
+  int8x8_t __s0_47 = __p0_47; \
+  int8x8_t __s2_47 = __p2_47; \
+  int8x8_t __rev0_47;  __rev0_47 = __builtin_shufflevector(__s0_47, __s0_47, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev2_47;  __rev2_47 = __builtin_shufflevector(__s2_47, __s2_47, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret_47; \
+  __ret_47 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_47, __p3_47), __rev0_47, __p1_47); \
   __ret_47 = __builtin_shufflevector(__ret_47, __ret_47, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_47; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_u8(__p0_48, __p1_48, __p2_48, __p3_48) __extension__ ({ \
-  uint8x16_t __s0_48 = __p0_48; \
-  uint8x16_t __s2_48 = __p2_48; \
-  uint8x16_t __ret_48; \
-  __ret_48 = vsetq_lane_u8(vgetq_lane_u8(__s2_48, __p3_48), __s0_48, __p1_48); \
+#define vcopy_lane_f32(__p0_48, __p1_48, __p2_48, __p3_48) __extension__ ({ \
+  float32x2_t __s0_48 = __p0_48; \
+  float32x2_t __s2_48 = __p2_48; \
+  float32x2_t __ret_48; \
+  __ret_48 = vset_lane_f32(vget_lane_f32(__s2_48, __p3_48), __s0_48, __p1_48); \
   __ret_48; \
 })
 #else
-#define vcopyq_laneq_u8(__p0_49, __p1_49, __p2_49, __p3_49) __extension__ ({ \
-  uint8x16_t __s0_49 = __p0_49; \
-  uint8x16_t __s2_49 = __p2_49; \
-  uint8x16_t __rev0_49;  __rev0_49 = __builtin_shufflevector(__s0_49, __s0_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev2_49;  __rev2_49 = __builtin_shufflevector(__s2_49, __s2_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __ret_49; \
-  __ret_49 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_49, __p3_49), __rev0_49, __p1_49); \
-  __ret_49 = __builtin_shufflevector(__ret_49, __ret_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+#define vcopy_lane_f32(__p0_49, __p1_49, __p2_49, __p3_49) __extension__ ({ \
+  float32x2_t __s0_49 = __p0_49; \
+  float32x2_t __s2_49 = __p2_49; \
+  float32x2_t __rev0_49;  __rev0_49 = __builtin_shufflevector(__s0_49, __s0_49, 1, 0); \
+  float32x2_t __rev2_49;  __rev2_49 = __builtin_shufflevector(__s2_49, __s2_49, 1, 0); \
+  float32x2_t __ret_49; \
+  __ret_49 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_49, __p3_49), __rev0_49, __p1_49); \
+  __ret_49 = __builtin_shufflevector(__ret_49, __ret_49, 1, 0); \
   __ret_49; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_u32(__p0_50, __p1_50, __p2_50, __p3_50) __extension__ ({ \
-  uint32x4_t __s0_50 = __p0_50; \
-  uint32x4_t __s2_50 = __p2_50; \
-  uint32x4_t __ret_50; \
-  __ret_50 = vsetq_lane_u32(vgetq_lane_u32(__s2_50, __p3_50), __s0_50, __p1_50); \
+#define vcopy_lane_s32(__p0_50, __p1_50, __p2_50, __p3_50) __extension__ ({ \
+  int32x2_t __s0_50 = __p0_50; \
+  int32x2_t __s2_50 = __p2_50; \
+  int32x2_t __ret_50; \
+  __ret_50 = vset_lane_s32(vget_lane_s32(__s2_50, __p3_50), __s0_50, __p1_50); \
   __ret_50; \
 })
 #else
-#define vcopyq_laneq_u32(__p0_51, __p1_51, __p2_51, __p3_51) __extension__ ({ \
-  uint32x4_t __s0_51 = __p0_51; \
-  uint32x4_t __s2_51 = __p2_51; \
-  uint32x4_t __rev0_51;  __rev0_51 = __builtin_shufflevector(__s0_51, __s0_51, 3, 2, 1, 0); \
-  uint32x4_t __rev2_51;  __rev2_51 = __builtin_shufflevector(__s2_51, __s2_51, 3, 2, 1, 0); \
-  uint32x4_t __ret_51; \
-  __ret_51 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_51, __p3_51), __rev0_51, __p1_51); \
-  __ret_51 = __builtin_shufflevector(__ret_51, __ret_51, 3, 2, 1, 0); \
+#define vcopy_lane_s32(__p0_51, __p1_51, __p2_51, __p3_51) __extension__ ({ \
+  int32x2_t __s0_51 = __p0_51; \
+  int32x2_t __s2_51 = __p2_51; \
+  int32x2_t __rev0_51;  __rev0_51 = __builtin_shufflevector(__s0_51, __s0_51, 1, 0); \
+  int32x2_t __rev2_51;  __rev2_51 = __builtin_shufflevector(__s2_51, __s2_51, 1, 0); \
+  int32x2_t __ret_51; \
+  __ret_51 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_51, __p3_51), __rev0_51, __p1_51); \
+  __ret_51 = __builtin_shufflevector(__ret_51, __ret_51, 1, 0); \
   __ret_51; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_u64(__p0_52, __p1_52, __p2_52, __p3_52) __extension__ ({ \
-  uint64x2_t __s0_52 = __p0_52; \
-  uint64x2_t __s2_52 = __p2_52; \
-  uint64x2_t __ret_52; \
-  __ret_52 = vsetq_lane_u64(vgetq_lane_u64(__s2_52, __p3_52), __s0_52, __p1_52); \
+#define vcopy_lane_s64(__p0_52, __p1_52, __p2_52, __p3_52) __extension__ ({ \
+  int64x1_t __s0_52 = __p0_52; \
+  int64x1_t __s2_52 = __p2_52; \
+  int64x1_t __ret_52; \
+  __ret_52 = vset_lane_s64(vget_lane_s64(__s2_52, __p3_52), __s0_52, __p1_52); \
   __ret_52; \
 })
 #else
-#define vcopyq_laneq_u64(__p0_53, __p1_53, __p2_53, __p3_53) __extension__ ({ \
-  uint64x2_t __s0_53 = __p0_53; \
-  uint64x2_t __s2_53 = __p2_53; \
-  uint64x2_t __rev0_53;  __rev0_53 = __builtin_shufflevector(__s0_53, __s0_53, 1, 0); \
-  uint64x2_t __rev2_53;  __rev2_53 = __builtin_shufflevector(__s2_53, __s2_53, 1, 0); \
-  uint64x2_t __ret_53; \
-  __ret_53 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_53, __p3_53), __rev0_53, __p1_53); \
-  __ret_53 = __builtin_shufflevector(__ret_53, __ret_53, 1, 0); \
+#define vcopy_lane_s64(__p0_53, __p1_53, __p2_53, __p3_53) __extension__ ({ \
+  int64x1_t __s0_53 = __p0_53; \
+  int64x1_t __s2_53 = __p2_53; \
+  int64x1_t __ret_53; \
+  __ret_53 = __noswap_vset_lane_s64(__noswap_vget_lane_s64(__s2_53, __p3_53), __s0_53, __p1_53); \
   __ret_53; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_u16(__p0_54, __p1_54, __p2_54, __p3_54) __extension__ ({ \
-  uint16x8_t __s0_54 = __p0_54; \
-  uint16x8_t __s2_54 = __p2_54; \
-  uint16x8_t __ret_54; \
-  __ret_54 = vsetq_lane_u16(vgetq_lane_u16(__s2_54, __p3_54), __s0_54, __p1_54); \
+#define vcopy_lane_s16(__p0_54, __p1_54, __p2_54, __p3_54) __extension__ ({ \
+  int16x4_t __s0_54 = __p0_54; \
+  int16x4_t __s2_54 = __p2_54; \
+  int16x4_t __ret_54; \
+  __ret_54 = vset_lane_s16(vget_lane_s16(__s2_54, __p3_54), __s0_54, __p1_54); \
   __ret_54; \
 })
 #else
-#define vcopyq_laneq_u16(__p0_55, __p1_55, __p2_55, __p3_55) __extension__ ({ \
-  uint16x8_t __s0_55 = __p0_55; \
-  uint16x8_t __s2_55 = __p2_55; \
-  uint16x8_t __rev0_55;  __rev0_55 = __builtin_shufflevector(__s0_55, __s0_55, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev2_55;  __rev2_55 = __builtin_shufflevector(__s2_55, __s2_55, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __ret_55; \
-  __ret_55 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_55, __p3_55), __rev0_55, __p1_55); \
-  __ret_55 = __builtin_shufflevector(__ret_55, __ret_55, 7, 6, 5, 4, 3, 2, 1, 0); \
+#define vcopy_lane_s16(__p0_55, __p1_55, __p2_55, __p3_55) __extension__ ({ \
+  int16x4_t __s0_55 = __p0_55; \
+  int16x4_t __s2_55 = __p2_55; \
+  int16x4_t __rev0_55;  __rev0_55 = __builtin_shufflevector(__s0_55, __s0_55, 3, 2, 1, 0); \
+  int16x4_t __rev2_55;  __rev2_55 = __builtin_shufflevector(__s2_55, __s2_55, 3, 2, 1, 0); \
+  int16x4_t __ret_55; \
+  __ret_55 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_55, __p3_55), __rev0_55, __p1_55); \
+  __ret_55 = __builtin_shufflevector(__ret_55, __ret_55, 3, 2, 1, 0); \
   __ret_55; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_s8(__p0_56, __p1_56, __p2_56, __p3_56) __extension__ ({ \
-  int8x16_t __s0_56 = __p0_56; \
-  int8x16_t __s2_56 = __p2_56; \
-  int8x16_t __ret_56; \
-  __ret_56 = vsetq_lane_s8(vgetq_lane_s8(__s2_56, __p3_56), __s0_56, __p1_56); \
+#define vcopyq_laneq_p8(__p0_56, __p1_56, __p2_56, __p3_56) __extension__ ({ \
+  poly8x16_t __s0_56 = __p0_56; \
+  poly8x16_t __s2_56 = __p2_56; \
+  poly8x16_t __ret_56; \
+  __ret_56 = vsetq_lane_p8(vgetq_lane_p8(__s2_56, __p3_56), __s0_56, __p1_56); \
   __ret_56; \
 })
 #else
-#define vcopyq_laneq_s8(__p0_57, __p1_57, __p2_57, __p3_57) __extension__ ({ \
-  int8x16_t __s0_57 = __p0_57; \
-  int8x16_t __s2_57 = __p2_57; \
-  int8x16_t __rev0_57;  __rev0_57 = __builtin_shufflevector(__s0_57, __s0_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev2_57;  __rev2_57 = __builtin_shufflevector(__s2_57, __s2_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __ret_57; \
-  __ret_57 = __noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_57, __p3_57), __rev0_57, __p1_57); \
+#define vcopyq_laneq_p8(__p0_57, __p1_57, __p2_57, __p3_57) __extension__ ({ \
+  poly8x16_t __s0_57 = __p0_57; \
+  poly8x16_t __s2_57 = __p2_57; \
+  poly8x16_t __rev0_57;  __rev0_57 = __builtin_shufflevector(__s0_57, __s0_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev2_57;  __rev2_57 = __builtin_shufflevector(__s2_57, __s2_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret_57; \
+  __ret_57 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_57, __p3_57), __rev0_57, __p1_57); \
   __ret_57 = __builtin_shufflevector(__ret_57, __ret_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_57; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_f32(__p0_58, __p1_58, __p2_58, __p3_58) __extension__ ({ \
-  float32x4_t __s0_58 = __p0_58; \
-  float32x4_t __s2_58 = __p2_58; \
-  float32x4_t __ret_58; \
-  __ret_58 = vsetq_lane_f32(vgetq_lane_f32(__s2_58, __p3_58), __s0_58, __p1_58); \
+#define vcopyq_laneq_p16(__p0_58, __p1_58, __p2_58, __p3_58) __extension__ ({ \
+  poly16x8_t __s0_58 = __p0_58; \
+  poly16x8_t __s2_58 = __p2_58; \
+  poly16x8_t __ret_58; \
+  __ret_58 = vsetq_lane_p16(vgetq_lane_p16(__s2_58, __p3_58), __s0_58, __p1_58); \
   __ret_58; \
 })
 #else
-#define vcopyq_laneq_f32(__p0_59, __p1_59, __p2_59, __p3_59) __extension__ ({ \
-  float32x4_t __s0_59 = __p0_59; \
-  float32x4_t __s2_59 = __p2_59; \
-  float32x4_t __rev0_59;  __rev0_59 = __builtin_shufflevector(__s0_59, __s0_59, 3, 2, 1, 0); \
-  float32x4_t __rev2_59;  __rev2_59 = __builtin_shufflevector(__s2_59, __s2_59, 3, 2, 1, 0); \
-  float32x4_t __ret_59; \
-  __ret_59 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_59, __p3_59), __rev0_59, __p1_59); \
-  __ret_59 = __builtin_shufflevector(__ret_59, __ret_59, 3, 2, 1, 0); \
+#define vcopyq_laneq_p16(__p0_59, __p1_59, __p2_59, __p3_59) __extension__ ({ \
+  poly16x8_t __s0_59 = __p0_59; \
+  poly16x8_t __s2_59 = __p2_59; \
+  poly16x8_t __rev0_59;  __rev0_59 = __builtin_shufflevector(__s0_59, __s0_59, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __rev2_59;  __rev2_59 = __builtin_shufflevector(__s2_59, __s2_59, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret_59; \
+  __ret_59 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_59, __p3_59), __rev0_59, __p1_59); \
+  __ret_59 = __builtin_shufflevector(__ret_59, __ret_59, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_59; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_s32(__p0_60, __p1_60, __p2_60, __p3_60) __extension__ ({ \
-  int32x4_t __s0_60 = __p0_60; \
-  int32x4_t __s2_60 = __p2_60; \
-  int32x4_t __ret_60; \
-  __ret_60 = vsetq_lane_s32(vgetq_lane_s32(__s2_60, __p3_60), __s0_60, __p1_60); \
+#define vcopyq_laneq_u8(__p0_60, __p1_60, __p2_60, __p3_60) __extension__ ({ \
+  uint8x16_t __s0_60 = __p0_60; \
+  uint8x16_t __s2_60 = __p2_60; \
+  uint8x16_t __ret_60; \
+  __ret_60 = vsetq_lane_u8(vgetq_lane_u8(__s2_60, __p3_60), __s0_60, __p1_60); \
   __ret_60; \
 })
 #else
-#define vcopyq_laneq_s32(__p0_61, __p1_61, __p2_61, __p3_61) __extension__ ({ \
-  int32x4_t __s0_61 = __p0_61; \
-  int32x4_t __s2_61 = __p2_61; \
-  int32x4_t __rev0_61;  __rev0_61 = __builtin_shufflevector(__s0_61, __s0_61, 3, 2, 1, 0); \
-  int32x4_t __rev2_61;  __rev2_61 = __builtin_shufflevector(__s2_61, __s2_61, 3, 2, 1, 0); \
-  int32x4_t __ret_61; \
-  __ret_61 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_61, __p3_61), __rev0_61, __p1_61); \
-  __ret_61 = __builtin_shufflevector(__ret_61, __ret_61, 3, 2, 1, 0); \
+#define vcopyq_laneq_u8(__p0_61, __p1_61, __p2_61, __p3_61) __extension__ ({ \
+  uint8x16_t __s0_61 = __p0_61; \
+  uint8x16_t __s2_61 = __p2_61; \
+  uint8x16_t __rev0_61;  __rev0_61 = __builtin_shufflevector(__s0_61, __s0_61, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev2_61;  __rev2_61 = __builtin_shufflevector(__s2_61, __s2_61, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_61; \
+  __ret_61 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_61, __p3_61), __rev0_61, __p1_61); \
+  __ret_61 = __builtin_shufflevector(__ret_61, __ret_61, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_61; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_s64(__p0_62, __p1_62, __p2_62, __p3_62) __extension__ ({ \
-  int64x2_t __s0_62 = __p0_62; \
-  int64x2_t __s2_62 = __p2_62; \
-  int64x2_t __ret_62; \
-  __ret_62 = vsetq_lane_s64(vgetq_lane_s64(__s2_62, __p3_62), __s0_62, __p1_62); \
+#define vcopyq_laneq_u32(__p0_62, __p1_62, __p2_62, __p3_62) __extension__ ({ \
+  uint32x4_t __s0_62 = __p0_62; \
+  uint32x4_t __s2_62 = __p2_62; \
+  uint32x4_t __ret_62; \
+  __ret_62 = vsetq_lane_u32(vgetq_lane_u32(__s2_62, __p3_62), __s0_62, __p1_62); \
   __ret_62; \
 })
 #else
-#define vcopyq_laneq_s64(__p0_63, __p1_63, __p2_63, __p3_63) __extension__ ({ \
-  int64x2_t __s0_63 = __p0_63; \
-  int64x2_t __s2_63 = __p2_63; \
-  int64x2_t __rev0_63;  __rev0_63 = __builtin_shufflevector(__s0_63, __s0_63, 1, 0); \
-  int64x2_t __rev2_63;  __rev2_63 = __builtin_shufflevector(__s2_63, __s2_63, 1, 0); \
-  int64x2_t __ret_63; \
-  __ret_63 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_63, __p3_63), __rev0_63, __p1_63); \
-  __ret_63 = __builtin_shufflevector(__ret_63, __ret_63, 1, 0); \
+#define vcopyq_laneq_u32(__p0_63, __p1_63, __p2_63, __p3_63) __extension__ ({ \
+  uint32x4_t __s0_63 = __p0_63; \
+  uint32x4_t __s2_63 = __p2_63; \
+  uint32x4_t __rev0_63;  __rev0_63 = __builtin_shufflevector(__s0_63, __s0_63, 3, 2, 1, 0); \
+  uint32x4_t __rev2_63;  __rev2_63 = __builtin_shufflevector(__s2_63, __s2_63, 3, 2, 1, 0); \
+  uint32x4_t __ret_63; \
+  __ret_63 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_63, __p3_63), __rev0_63, __p1_63); \
+  __ret_63 = __builtin_shufflevector(__ret_63, __ret_63, 3, 2, 1, 0); \
   __ret_63; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_s16(__p0_64, __p1_64, __p2_64, __p3_64) __extension__ ({ \
-  int16x8_t __s0_64 = __p0_64; \
-  int16x8_t __s2_64 = __p2_64; \
-  int16x8_t __ret_64; \
-  __ret_64 = vsetq_lane_s16(vgetq_lane_s16(__s2_64, __p3_64), __s0_64, __p1_64); \
+#define vcopyq_laneq_u64(__p0_64, __p1_64, __p2_64, __p3_64) __extension__ ({ \
+  uint64x2_t __s0_64 = __p0_64; \
+  uint64x2_t __s2_64 = __p2_64; \
+  uint64x2_t __ret_64; \
+  __ret_64 = vsetq_lane_u64(vgetq_lane_u64(__s2_64, __p3_64), __s0_64, __p1_64); \
   __ret_64; \
 })
 #else
-#define vcopyq_laneq_s16(__p0_65, __p1_65, __p2_65, __p3_65) __extension__ ({ \
-  int16x8_t __s0_65 = __p0_65; \
-  int16x8_t __s2_65 = __p2_65; \
-  int16x8_t __rev0_65;  __rev0_65 = __builtin_shufflevector(__s0_65, __s0_65, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_65;  __rev2_65 = __builtin_shufflevector(__s2_65, __s2_65, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __ret_65; \
-  __ret_65 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_65, __p3_65), __rev0_65, __p1_65); \
-  __ret_65 = __builtin_shufflevector(__ret_65, __ret_65, 7, 6, 5, 4, 3, 2, 1, 0); \
+#define vcopyq_laneq_u64(__p0_65, __p1_65, __p2_65, __p3_65) __extension__ ({ \
+  uint64x2_t __s0_65 = __p0_65; \
+  uint64x2_t __s2_65 = __p2_65; \
+  uint64x2_t __rev0_65;  __rev0_65 = __builtin_shufflevector(__s0_65, __s0_65, 1, 0); \
+  uint64x2_t __rev2_65;  __rev2_65 = __builtin_shufflevector(__s2_65, __s2_65, 1, 0); \
+  uint64x2_t __ret_65; \
+  __ret_65 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_65, __p3_65), __rev0_65, __p1_65); \
+  __ret_65 = __builtin_shufflevector(__ret_65, __ret_65, 1, 0); \
   __ret_65; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_p8(__p0_66, __p1_66, __p2_66, __p3_66) __extension__ ({ \
-  poly8x8_t __s0_66 = __p0_66; \
-  poly8x16_t __s2_66 = __p2_66; \
-  poly8x8_t __ret_66; \
-  __ret_66 = vset_lane_p8(vgetq_lane_p8(__s2_66, __p3_66), __s0_66, __p1_66); \
+#define vcopyq_laneq_u16(__p0_66, __p1_66, __p2_66, __p3_66) __extension__ ({ \
+  uint16x8_t __s0_66 = __p0_66; \
+  uint16x8_t __s2_66 = __p2_66; \
+  uint16x8_t __ret_66; \
+  __ret_66 = vsetq_lane_u16(vgetq_lane_u16(__s2_66, __p3_66), __s0_66, __p1_66); \
   __ret_66; \
 })
 #else
-#define vcopy_laneq_p8(__p0_67, __p1_67, __p2_67, __p3_67) __extension__ ({ \
-  poly8x8_t __s0_67 = __p0_67; \
-  poly8x16_t __s2_67 = __p2_67; \
-  poly8x8_t __rev0_67;  __rev0_67 = __builtin_shufflevector(__s0_67, __s0_67, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x16_t __rev2_67;  __rev2_67 = __builtin_shufflevector(__s2_67, __s2_67, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x8_t __ret_67; \
-  __ret_67 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_67, __p3_67), __rev0_67, __p1_67); \
+#define vcopyq_laneq_u16(__p0_67, __p1_67, __p2_67, __p3_67) __extension__ ({ \
+  uint16x8_t __s0_67 = __p0_67; \
+  uint16x8_t __s2_67 = __p2_67; \
+  uint16x8_t __rev0_67;  __rev0_67 = __builtin_shufflevector(__s0_67, __s0_67, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2_67;  __rev2_67 = __builtin_shufflevector(__s2_67, __s2_67, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret_67; \
+  __ret_67 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_67, __p3_67), __rev0_67, __p1_67); \
   __ret_67 = __builtin_shufflevector(__ret_67, __ret_67, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_67; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_p16(__p0_68, __p1_68, __p2_68, __p3_68) __extension__ ({ \
-  poly16x4_t __s0_68 = __p0_68; \
-  poly16x8_t __s2_68 = __p2_68; \
-  poly16x4_t __ret_68; \
-  __ret_68 = vset_lane_p16(vgetq_lane_p16(__s2_68, __p3_68), __s0_68, __p1_68); \
+#define vcopyq_laneq_s8(__p0_68, __p1_68, __p2_68, __p3_68) __extension__ ({ \
+  int8x16_t __s0_68 = __p0_68; \
+  int8x16_t __s2_68 = __p2_68; \
+  int8x16_t __ret_68; \
+  __ret_68 = vsetq_lane_s8(vgetq_lane_s8(__s2_68, __p3_68), __s0_68, __p1_68); \
   __ret_68; \
 })
 #else
-#define vcopy_laneq_p16(__p0_69, __p1_69, __p2_69, __p3_69) __extension__ ({ \
-  poly16x4_t __s0_69 = __p0_69; \
-  poly16x8_t __s2_69 = __p2_69; \
-  poly16x4_t __rev0_69;  __rev0_69 = __builtin_shufflevector(__s0_69, __s0_69, 3, 2, 1, 0); \
-  poly16x8_t __rev2_69;  __rev2_69 = __builtin_shufflevector(__s2_69, __s2_69, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly16x4_t __ret_69; \
-  __ret_69 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_69, __p3_69), __rev0_69, __p1_69); \
-  __ret_69 = __builtin_shufflevector(__ret_69, __ret_69, 3, 2, 1, 0); \
+#define vcopyq_laneq_s8(__p0_69, __p1_69, __p2_69, __p3_69) __extension__ ({ \
+  int8x16_t __s0_69 = __p0_69; \
+  int8x16_t __s2_69 = __p2_69; \
+  int8x16_t __rev0_69;  __rev0_69 = __builtin_shufflevector(__s0_69, __s0_69, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev2_69;  __rev2_69 = __builtin_shufflevector(__s2_69, __s2_69, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_69; \
+  __ret_69 = __noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_69, __p3_69), __rev0_69, __p1_69); \
+  __ret_69 = __builtin_shufflevector(__ret_69, __ret_69, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_69; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_u8(__p0_70, __p1_70, __p2_70, __p3_70) __extension__ ({ \
-  uint8x8_t __s0_70 = __p0_70; \
-  uint8x16_t __s2_70 = __p2_70; \
-  uint8x8_t __ret_70; \
-  __ret_70 = vset_lane_u8(vgetq_lane_u8(__s2_70, __p3_70), __s0_70, __p1_70); \
+#define vcopyq_laneq_f32(__p0_70, __p1_70, __p2_70, __p3_70) __extension__ ({ \
+  float32x4_t __s0_70 = __p0_70; \
+  float32x4_t __s2_70 = __p2_70; \
+  float32x4_t __ret_70; \
+  __ret_70 = vsetq_lane_f32(vgetq_lane_f32(__s2_70, __p3_70), __s0_70, __p1_70); \
   __ret_70; \
 })
 #else
-#define vcopy_laneq_u8(__p0_71, __p1_71, __p2_71, __p3_71) __extension__ ({ \
-  uint8x8_t __s0_71 = __p0_71; \
-  uint8x16_t __s2_71 = __p2_71; \
-  uint8x8_t __rev0_71;  __rev0_71 = __builtin_shufflevector(__s0_71, __s0_71, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev2_71;  __rev2_71 = __builtin_shufflevector(__s2_71, __s2_71, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __ret_71; \
-  __ret_71 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_71, __p3_71), __rev0_71, __p1_71); \
-  __ret_71 = __builtin_shufflevector(__ret_71, __ret_71, 7, 6, 5, 4, 3, 2, 1, 0); \
+#define vcopyq_laneq_f32(__p0_71, __p1_71, __p2_71, __p3_71) __extension__ ({ \
+  float32x4_t __s0_71 = __p0_71; \
+  float32x4_t __s2_71 = __p2_71; \
+  float32x4_t __rev0_71;  __rev0_71 = __builtin_shufflevector(__s0_71, __s0_71, 3, 2, 1, 0); \
+  float32x4_t __rev2_71;  __rev2_71 = __builtin_shufflevector(__s2_71, __s2_71, 3, 2, 1, 0); \
+  float32x4_t __ret_71; \
+  __ret_71 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_71, __p3_71), __rev0_71, __p1_71); \
+  __ret_71 = __builtin_shufflevector(__ret_71, __ret_71, 3, 2, 1, 0); \
   __ret_71; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_u32(__p0_72, __p1_72, __p2_72, __p3_72) __extension__ ({ \
-  uint32x2_t __s0_72 = __p0_72; \
-  uint32x4_t __s2_72 = __p2_72; \
-  uint32x2_t __ret_72; \
-  __ret_72 = vset_lane_u32(vgetq_lane_u32(__s2_72, __p3_72), __s0_72, __p1_72); \
+#define vcopyq_laneq_s32(__p0_72, __p1_72, __p2_72, __p3_72) __extension__ ({ \
+  int32x4_t __s0_72 = __p0_72; \
+  int32x4_t __s2_72 = __p2_72; \
+  int32x4_t __ret_72; \
+  __ret_72 = vsetq_lane_s32(vgetq_lane_s32(__s2_72, __p3_72), __s0_72, __p1_72); \
   __ret_72; \
 })
 #else
-#define vcopy_laneq_u32(__p0_73, __p1_73, __p2_73, __p3_73) __extension__ ({ \
-  uint32x2_t __s0_73 = __p0_73; \
-  uint32x4_t __s2_73 = __p2_73; \
-  uint32x2_t __rev0_73;  __rev0_73 = __builtin_shufflevector(__s0_73, __s0_73, 1, 0); \
-  uint32x4_t __rev2_73;  __rev2_73 = __builtin_shufflevector(__s2_73, __s2_73, 3, 2, 1, 0); \
-  uint32x2_t __ret_73; \
-  __ret_73 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_73, __p3_73), __rev0_73, __p1_73); \
-  __ret_73 = __builtin_shufflevector(__ret_73, __ret_73, 1, 0); \
+#define vcopyq_laneq_s32(__p0_73, __p1_73, __p2_73, __p3_73) __extension__ ({ \
+  int32x4_t __s0_73 = __p0_73; \
+  int32x4_t __s2_73 = __p2_73; \
+  int32x4_t __rev0_73;  __rev0_73 = __builtin_shufflevector(__s0_73, __s0_73, 3, 2, 1, 0); \
+  int32x4_t __rev2_73;  __rev2_73 = __builtin_shufflevector(__s2_73, __s2_73, 3, 2, 1, 0); \
+  int32x4_t __ret_73; \
+  __ret_73 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_73, __p3_73), __rev0_73, __p1_73); \
+  __ret_73 = __builtin_shufflevector(__ret_73, __ret_73, 3, 2, 1, 0); \
   __ret_73; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_u64(__p0_74, __p1_74, __p2_74, __p3_74) __extension__ ({ \
-  uint64x1_t __s0_74 = __p0_74; \
-  uint64x2_t __s2_74 = __p2_74; \
-  uint64x1_t __ret_74; \
-  __ret_74 = vset_lane_u64(vgetq_lane_u64(__s2_74, __p3_74), __s0_74, __p1_74); \
+#define vcopyq_laneq_s64(__p0_74, __p1_74, __p2_74, __p3_74) __extension__ ({ \
+  int64x2_t __s0_74 = __p0_74; \
+  int64x2_t __s2_74 = __p2_74; \
+  int64x2_t __ret_74; \
+  __ret_74 = vsetq_lane_s64(vgetq_lane_s64(__s2_74, __p3_74), __s0_74, __p1_74); \
   __ret_74; \
 })
 #else
-#define vcopy_laneq_u64(__p0_75, __p1_75, __p2_75, __p3_75) __extension__ ({ \
-  uint64x1_t __s0_75 = __p0_75; \
-  uint64x2_t __s2_75 = __p2_75; \
-  uint64x2_t __rev2_75;  __rev2_75 = __builtin_shufflevector(__s2_75, __s2_75, 1, 0); \
-  uint64x1_t __ret_75; \
-  __ret_75 = __noswap_vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_75, __p3_75), __s0_75, __p1_75); \
+#define vcopyq_laneq_s64(__p0_75, __p1_75, __p2_75, __p3_75) __extension__ ({ \
+  int64x2_t __s0_75 = __p0_75; \
+  int64x2_t __s2_75 = __p2_75; \
+  int64x2_t __rev0_75;  __rev0_75 = __builtin_shufflevector(__s0_75, __s0_75, 1, 0); \
+  int64x2_t __rev2_75;  __rev2_75 = __builtin_shufflevector(__s2_75, __s2_75, 1, 0); \
+  int64x2_t __ret_75; \
+  __ret_75 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_75, __p3_75), __rev0_75, __p1_75); \
+  __ret_75 = __builtin_shufflevector(__ret_75, __ret_75, 1, 0); \
   __ret_75; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_u16(__p0_76, __p1_76, __p2_76, __p3_76) __extension__ ({ \
-  uint16x4_t __s0_76 = __p0_76; \
-  uint16x8_t __s2_76 = __p2_76; \
-  uint16x4_t __ret_76; \
-  __ret_76 = vset_lane_u16(vgetq_lane_u16(__s2_76, __p3_76), __s0_76, __p1_76); \
+#define vcopyq_laneq_s16(__p0_76, __p1_76, __p2_76, __p3_76) __extension__ ({ \
+  int16x8_t __s0_76 = __p0_76; \
+  int16x8_t __s2_76 = __p2_76; \
+  int16x8_t __ret_76; \
+  __ret_76 = vsetq_lane_s16(vgetq_lane_s16(__s2_76, __p3_76), __s0_76, __p1_76); \
   __ret_76; \
 })
 #else
-#define vcopy_laneq_u16(__p0_77, __p1_77, __p2_77, __p3_77) __extension__ ({ \
-  uint16x4_t __s0_77 = __p0_77; \
-  uint16x8_t __s2_77 = __p2_77; \
-  uint16x4_t __rev0_77;  __rev0_77 = __builtin_shufflevector(__s0_77, __s0_77, 3, 2, 1, 0); \
-  uint16x8_t __rev2_77;  __rev2_77 = __builtin_shufflevector(__s2_77, __s2_77, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x4_t __ret_77; \
-  __ret_77 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_77, __p3_77), __rev0_77, __p1_77); \
-  __ret_77 = __builtin_shufflevector(__ret_77, __ret_77, 3, 2, 1, 0); \
+#define vcopyq_laneq_s16(__p0_77, __p1_77, __p2_77, __p3_77) __extension__ ({ \
+  int16x8_t __s0_77 = __p0_77; \
+  int16x8_t __s2_77 = __p2_77; \
+  int16x8_t __rev0_77;  __rev0_77 = __builtin_shufflevector(__s0_77, __s0_77, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2_77;  __rev2_77 = __builtin_shufflevector(__s2_77, __s2_77, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret_77; \
+  __ret_77 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_77, __p3_77), __rev0_77, __p1_77); \
+  __ret_77 = __builtin_shufflevector(__ret_77, __ret_77, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_77; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_s8(__p0_78, __p1_78, __p2_78, __p3_78) __extension__ ({ \
-  int8x8_t __s0_78 = __p0_78; \
-  int8x16_t __s2_78 = __p2_78; \
-  int8x8_t __ret_78; \
-  __ret_78 = vset_lane_s8(vgetq_lane_s8(__s2_78, __p3_78), __s0_78, __p1_78); \
+#define vcopy_laneq_p8(__p0_78, __p1_78, __p2_78, __p3_78) __extension__ ({ \
+  poly8x8_t __s0_78 = __p0_78; \
+  poly8x16_t __s2_78 = __p2_78; \
+  poly8x8_t __ret_78; \
+  __ret_78 = vset_lane_p8(vgetq_lane_p8(__s2_78, __p3_78), __s0_78, __p1_78); \
   __ret_78; \
 })
 #else
-#define vcopy_laneq_s8(__p0_79, __p1_79, __p2_79, __p3_79) __extension__ ({ \
-  int8x8_t __s0_79 = __p0_79; \
-  int8x16_t __s2_79 = __p2_79; \
-  int8x8_t __rev0_79;  __rev0_79 = __builtin_shufflevector(__s0_79, __s0_79, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev2_79;  __rev2_79 = __builtin_shufflevector(__s2_79, __s2_79, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __ret_79; \
-  __ret_79 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_79, __p3_79), __rev0_79, __p1_79); \
+#define vcopy_laneq_p8(__p0_79, __p1_79, __p2_79, __p3_79) __extension__ ({ \
+  poly8x8_t __s0_79 = __p0_79; \
+  poly8x16_t __s2_79 = __p2_79; \
+  poly8x8_t __rev0_79;  __rev0_79 = __builtin_shufflevector(__s0_79, __s0_79, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev2_79;  __rev2_79 = __builtin_shufflevector(__s2_79, __s2_79, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret_79; \
+  __ret_79 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_79, __p3_79), __rev0_79, __p1_79); \
   __ret_79 = __builtin_shufflevector(__ret_79, __ret_79, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_79; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_f32(__p0_80, __p1_80, __p2_80, __p3_80) __extension__ ({ \
-  float32x2_t __s0_80 = __p0_80; \
-  float32x4_t __s2_80 = __p2_80; \
-  float32x2_t __ret_80; \
-  __ret_80 = vset_lane_f32(vgetq_lane_f32(__s2_80, __p3_80), __s0_80, __p1_80); \
+#define vcopy_laneq_p16(__p0_80, __p1_80, __p2_80, __p3_80) __extension__ ({ \
+  poly16x4_t __s0_80 = __p0_80; \
+  poly16x8_t __s2_80 = __p2_80; \
+  poly16x4_t __ret_80; \
+  __ret_80 = vset_lane_p16(vgetq_lane_p16(__s2_80, __p3_80), __s0_80, __p1_80); \
   __ret_80; \
 })
 #else
-#define vcopy_laneq_f32(__p0_81, __p1_81, __p2_81, __p3_81) __extension__ ({ \
-  float32x2_t __s0_81 = __p0_81; \
-  float32x4_t __s2_81 = __p2_81; \
-  float32x2_t __rev0_81;  __rev0_81 = __builtin_shufflevector(__s0_81, __s0_81, 1, 0); \
-  float32x4_t __rev2_81;  __rev2_81 = __builtin_shufflevector(__s2_81, __s2_81, 3, 2, 1, 0); \
-  float32x2_t __ret_81; \
-  __ret_81 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_81, __p3_81), __rev0_81, __p1_81); \
-  __ret_81 = __builtin_shufflevector(__ret_81, __ret_81, 1, 0); \
+#define vcopy_laneq_p16(__p0_81, __p1_81, __p2_81, __p3_81) __extension__ ({ \
+  poly16x4_t __s0_81 = __p0_81; \
+  poly16x8_t __s2_81 = __p2_81; \
+  poly16x4_t __rev0_81;  __rev0_81 = __builtin_shufflevector(__s0_81, __s0_81, 3, 2, 1, 0); \
+  poly16x8_t __rev2_81;  __rev2_81 = __builtin_shufflevector(__s2_81, __s2_81, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x4_t __ret_81; \
+  __ret_81 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_81, __p3_81), __rev0_81, __p1_81); \
+  __ret_81 = __builtin_shufflevector(__ret_81, __ret_81, 3, 2, 1, 0); \
   __ret_81; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_s32(__p0_82, __p1_82, __p2_82, __p3_82) __extension__ ({ \
-  int32x2_t __s0_82 = __p0_82; \
-  int32x4_t __s2_82 = __p2_82; \
-  int32x2_t __ret_82; \
-  __ret_82 = vset_lane_s32(vgetq_lane_s32(__s2_82, __p3_82), __s0_82, __p1_82); \
+#define vcopy_laneq_u8(__p0_82, __p1_82, __p2_82, __p3_82) __extension__ ({ \
+  uint8x8_t __s0_82 = __p0_82; \
+  uint8x16_t __s2_82 = __p2_82; \
+  uint8x8_t __ret_82; \
+  __ret_82 = vset_lane_u8(vgetq_lane_u8(__s2_82, __p3_82), __s0_82, __p1_82); \
   __ret_82; \
 })
 #else
-#define vcopy_laneq_s32(__p0_83, __p1_83, __p2_83, __p3_83) __extension__ ({ \
-  int32x2_t __s0_83 = __p0_83; \
-  int32x4_t __s2_83 = __p2_83; \
-  int32x2_t __rev0_83;  __rev0_83 = __builtin_shufflevector(__s0_83, __s0_83, 1, 0); \
-  int32x4_t __rev2_83;  __rev2_83 = __builtin_shufflevector(__s2_83, __s2_83, 3, 2, 1, 0); \
-  int32x2_t __ret_83; \
-  __ret_83 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_83, __p3_83), __rev0_83, __p1_83); \
-  __ret_83 = __builtin_shufflevector(__ret_83, __ret_83, 1, 0); \
+#define vcopy_laneq_u8(__p0_83, __p1_83, __p2_83, __p3_83) __extension__ ({ \
+  uint8x8_t __s0_83 = __p0_83; \
+  uint8x16_t __s2_83 = __p2_83; \
+  uint8x8_t __rev0_83;  __rev0_83 = __builtin_shufflevector(__s0_83, __s0_83, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev2_83;  __rev2_83 = __builtin_shufflevector(__s2_83, __s2_83, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret_83; \
+  __ret_83 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_83, __p3_83), __rev0_83, __p1_83); \
+  __ret_83 = __builtin_shufflevector(__ret_83, __ret_83, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_83; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_s64(__p0_84, __p1_84, __p2_84, __p3_84) __extension__ ({ \
-  int64x1_t __s0_84 = __p0_84; \
-  int64x2_t __s2_84 = __p2_84; \
-  int64x1_t __ret_84; \
-  __ret_84 = vset_lane_s64(vgetq_lane_s64(__s2_84, __p3_84), __s0_84, __p1_84); \
+#define vcopy_laneq_u32(__p0_84, __p1_84, __p2_84, __p3_84) __extension__ ({ \
+  uint32x2_t __s0_84 = __p0_84; \
+  uint32x4_t __s2_84 = __p2_84; \
+  uint32x2_t __ret_84; \
+  __ret_84 = vset_lane_u32(vgetq_lane_u32(__s2_84, __p3_84), __s0_84, __p1_84); \
   __ret_84; \
 })
 #else
-#define vcopy_laneq_s64(__p0_85, __p1_85, __p2_85, __p3_85) __extension__ ({ \
-  int64x1_t __s0_85 = __p0_85; \
-  int64x2_t __s2_85 = __p2_85; \
-  int64x2_t __rev2_85;  __rev2_85 = __builtin_shufflevector(__s2_85, __s2_85, 1, 0); \
-  int64x1_t __ret_85; \
-  __ret_85 = __noswap_vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_85, __p3_85), __s0_85, __p1_85); \
+#define vcopy_laneq_u32(__p0_85, __p1_85, __p2_85, __p3_85) __extension__ ({ \
+  uint32x2_t __s0_85 = __p0_85; \
+  uint32x4_t __s2_85 = __p2_85; \
+  uint32x2_t __rev0_85;  __rev0_85 = __builtin_shufflevector(__s0_85, __s0_85, 1, 0); \
+  uint32x4_t __rev2_85;  __rev2_85 = __builtin_shufflevector(__s2_85, __s2_85, 3, 2, 1, 0); \
+  uint32x2_t __ret_85; \
+  __ret_85 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_85, __p3_85), __rev0_85, __p1_85); \
+  __ret_85 = __builtin_shufflevector(__ret_85, __ret_85, 1, 0); \
   __ret_85; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_s16(__p0_86, __p1_86, __p2_86, __p3_86) __extension__ ({ \
-  int16x4_t __s0_86 = __p0_86; \
-  int16x8_t __s2_86 = __p2_86; \
-  int16x4_t __ret_86; \
-  __ret_86 = vset_lane_s16(vgetq_lane_s16(__s2_86, __p3_86), __s0_86, __p1_86); \
+#define vcopy_laneq_u64(__p0_86, __p1_86, __p2_86, __p3_86) __extension__ ({ \
+  uint64x1_t __s0_86 = __p0_86; \
+  uint64x2_t __s2_86 = __p2_86; \
+  uint64x1_t __ret_86; \
+  __ret_86 = vset_lane_u64(vgetq_lane_u64(__s2_86, __p3_86), __s0_86, __p1_86); \
   __ret_86; \
 })
 #else
-#define vcopy_laneq_s16(__p0_87, __p1_87, __p2_87, __p3_87) __extension__ ({ \
-  int16x4_t __s0_87 = __p0_87; \
-  int16x8_t __s2_87 = __p2_87; \
-  int16x4_t __rev0_87;  __rev0_87 = __builtin_shufflevector(__s0_87, __s0_87, 3, 2, 1, 0); \
-  int16x8_t __rev2_87;  __rev2_87 = __builtin_shufflevector(__s2_87, __s2_87, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __ret_87; \
-  __ret_87 = __noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_87, __p3_87), __rev0_87, __p1_87); \
-  __ret_87 = __builtin_shufflevector(__ret_87, __ret_87, 3, 2, 1, 0); \
+#define vcopy_laneq_u64(__p0_87, __p1_87, __p2_87, __p3_87) __extension__ ({ \
+  uint64x1_t __s0_87 = __p0_87; \
+  uint64x2_t __s2_87 = __p2_87; \
+  uint64x2_t __rev2_87;  __rev2_87 = __builtin_shufflevector(__s2_87, __s2_87, 1, 0); \
+  uint64x1_t __ret_87; \
+  __ret_87 = __noswap_vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_87, __p3_87), __s0_87, __p1_87); \
   __ret_87; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_u16(__p0_88, __p1_88, __p2_88, __p3_88) __extension__ ({ \
+  uint16x4_t __s0_88 = __p0_88; \
+  uint16x8_t __s2_88 = __p2_88; \
+  uint16x4_t __ret_88; \
+  __ret_88 = vset_lane_u16(vgetq_lane_u16(__s2_88, __p3_88), __s0_88, __p1_88); \
+  __ret_88; \
+})
+#else
+#define vcopy_laneq_u16(__p0_89, __p1_89, __p2_89, __p3_89) __extension__ ({ \
+  uint16x4_t __s0_89 = __p0_89; \
+  uint16x8_t __s2_89 = __p2_89; \
+  uint16x4_t __rev0_89;  __rev0_89 = __builtin_shufflevector(__s0_89, __s0_89, 3, 2, 1, 0); \
+  uint16x8_t __rev2_89;  __rev2_89 = __builtin_shufflevector(__s2_89, __s2_89, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret_89; \
+  __ret_89 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_89, __p3_89), __rev0_89, __p1_89); \
+  __ret_89 = __builtin_shufflevector(__ret_89, __ret_89, 3, 2, 1, 0); \
+  __ret_89; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s8(__p0_90, __p1_90, __p2_90, __p3_90) __extension__ ({ \
+  int8x8_t __s0_90 = __p0_90; \
+  int8x16_t __s2_90 = __p2_90; \
+  int8x8_t __ret_90; \
+  __ret_90 = vset_lane_s8(vgetq_lane_s8(__s2_90, __p3_90), __s0_90, __p1_90); \
+  __ret_90; \
+})
+#else
+#define vcopy_laneq_s8(__p0_91, __p1_91, __p2_91, __p3_91) __extension__ ({ \
+  int8x8_t __s0_91 = __p0_91; \
+  int8x16_t __s2_91 = __p2_91; \
+  int8x8_t __rev0_91;  __rev0_91 = __builtin_shufflevector(__s0_91, __s0_91, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev2_91;  __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret_91; \
+  __ret_91 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_91, __p3_91), __rev0_91, __p1_91); \
+  __ret_91 = __builtin_shufflevector(__ret_91, __ret_91, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_91; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_f32(__p0_92, __p1_92, __p2_92, __p3_92) __extension__ ({ \
+  float32x2_t __s0_92 = __p0_92; \
+  float32x4_t __s2_92 = __p2_92; \
+  float32x2_t __ret_92; \
+  __ret_92 = vset_lane_f32(vgetq_lane_f32(__s2_92, __p3_92), __s0_92, __p1_92); \
+  __ret_92; \
+})
+#else
+#define vcopy_laneq_f32(__p0_93, __p1_93, __p2_93, __p3_93) __extension__ ({ \
+  float32x2_t __s0_93 = __p0_93; \
+  float32x4_t __s2_93 = __p2_93; \
+  float32x2_t __rev0_93;  __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 1, 0); \
+  float32x4_t __rev2_93;  __rev2_93 = __builtin_shufflevector(__s2_93, __s2_93, 3, 2, 1, 0); \
+  float32x2_t __ret_93; \
+  __ret_93 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_93, __p3_93), __rev0_93, __p1_93); \
+  __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 1, 0); \
+  __ret_93; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s32(__p0_94, __p1_94, __p2_94, __p3_94) __extension__ ({ \
+  int32x2_t __s0_94 = __p0_94; \
+  int32x4_t __s2_94 = __p2_94; \
+  int32x2_t __ret_94; \
+  __ret_94 = vset_lane_s32(vgetq_lane_s32(__s2_94, __p3_94), __s0_94, __p1_94); \
+  __ret_94; \
+})
+#else
+#define vcopy_laneq_s32(__p0_95, __p1_95, __p2_95, __p3_95) __extension__ ({ \
+  int32x2_t __s0_95 = __p0_95; \
+  int32x4_t __s2_95 = __p2_95; \
+  int32x2_t __rev0_95;  __rev0_95 = __builtin_shufflevector(__s0_95, __s0_95, 1, 0); \
+  int32x4_t __rev2_95;  __rev2_95 = __builtin_shufflevector(__s2_95, __s2_95, 3, 2, 1, 0); \
+  int32x2_t __ret_95; \
+  __ret_95 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_95, __p3_95), __rev0_95, __p1_95); \
+  __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 1, 0); \
+  __ret_95; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s64(__p0_96, __p1_96, __p2_96, __p3_96) __extension__ ({ \
+  int64x1_t __s0_96 = __p0_96; \
+  int64x2_t __s2_96 = __p2_96; \
+  int64x1_t __ret_96; \
+  __ret_96 = vset_lane_s64(vgetq_lane_s64(__s2_96, __p3_96), __s0_96, __p1_96); \
+  __ret_96; \
+})
+#else
+#define vcopy_laneq_s64(__p0_97, __p1_97, __p2_97, __p3_97) __extension__ ({ \
+  int64x1_t __s0_97 = __p0_97; \
+  int64x2_t __s2_97 = __p2_97; \
+  int64x2_t __rev2_97;  __rev2_97 = __builtin_shufflevector(__s2_97, __s2_97, 1, 0); \
+  int64x1_t __ret_97; \
+  __ret_97 = __noswap_vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_97, __p3_97), __s0_97, __p1_97); \
+  __ret_97; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s16(__p0_98, __p1_98, __p2_98, __p3_98) __extension__ ({ \
+  int16x4_t __s0_98 = __p0_98; \
+  int16x8_t __s2_98 = __p2_98; \
+  int16x4_t __ret_98; \
+  __ret_98 = vset_lane_s16(vgetq_lane_s16(__s2_98, __p3_98), __s0_98, __p1_98); \
+  __ret_98; \
+})
+#else
+#define vcopy_laneq_s16(__p0_99, __p1_99, __p2_99, __p3_99) __extension__ ({ \
+  int16x4_t __s0_99 = __p0_99; \
+  int16x8_t __s2_99 = __p2_99; \
+  int16x4_t __rev0_99;  __rev0_99 = __builtin_shufflevector(__s0_99, __s0_99, 3, 2, 1, 0); \
+  int16x8_t __rev2_99;  __rev2_99 = __builtin_shufflevector(__s2_99, __s2_99, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret_99; \
+  __ret_99 = __noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_99, __p3_99), __rev0_99, __p1_99); \
+  __ret_99 = __builtin_shufflevector(__ret_99, __ret_99, 3, 2, 1, 0); \
+  __ret_99; \
+})
+#endif
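/* Illustration (not part of the generated header): every big-endian branch
 * in the vcopy_lane*/ /*vcopy*_laneq* blocks above has the same shape --
 * reverse the lanes of each input with __builtin_shufflevector, perform the
 * lane copy on the reversed vectors through the __noswap_ helpers, then
 * reverse the result back.  A minimal standalone sketch of that
 * reverse/operate/reverse pattern, under the assumption that a plain Clang
 * extended vector and the hypothetical __example_copy_lane name stand in
 * for the real NEON types and intrinsics:
 */
#include <stdint.h>

typedef int32_t example_i32x2 __attribute__((ext_vector_type(2)));

static example_i32x2 __example_copy_lane(example_i32x2 dst, int dst_lane,
                                         example_i32x2 src, int src_lane) {
  /* Reverse both inputs into hardware lane order. */
  example_i32x2 rev_dst = __builtin_shufflevector(dst, dst, 1, 0);
  example_i32x2 rev_src = __builtin_shufflevector(src, src, 1, 0);
  /* The lane copy itself (vset_lane of a vget_lane in the macros). */
  rev_dst[dst_lane] = rev_src[src_lane];
  /* Reverse the result back to the caller's lane order. */
  return __builtin_shufflevector(rev_dst, rev_dst, 1, 0);
}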
+
+#ifdef __LITTLE_ENDIAN__
 __ai poly64x1_t vcreate_p64(uint64_t __p0) {
   poly64x1_t __ret;
   __ret = (poly64x1_t)(__p0);
@@ -47670,35 +54546,15 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __ret;
-  __ret = vfmaq_f32(__p0, __p1, (float32x4_t) {__p2, __p2, __p2, __p2});
+__ai float64x1_t vfma_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) {
+  float64x1_t __ret;
+  __ret = vfma_f64(__p0, __p1, (float64x1_t) {__p2});
   return __ret;
 }
 #else
-__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __ret;
-  __ret = __noswap_vfmaq_f32(__rev0, __rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __ret;
-  __ret = vfma_f32(__p0, __p1, (float32x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __ret;
-  __ret = __noswap_vfma_f32(__rev0, __rev1, (float32x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+__ai float64x1_t vfma_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) {
+  float64x1_t __ret;
+  __ret = __noswap_vfma_f64(__p0, __p1, (float64x1_t) {__p2});
   return __ret;
 }
 #endif
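/* Illustration (not part of the generated header): the *_n_* variants above
 * splat the scalar operand into a vector literal and defer to the matching
 * vector fused-multiply-add; for a one-lane float64x1_t there is nothing to
 * lane-reverse, so the little- and big-endian bodies differ only in the
 * __noswap_ spelling.  A minimal sketch of the splat-then-fma shape, with a
 * hypothetical __example_ name and fma() from <math.h> standing in for the
 * vfma_f64 intrinsic (vfma_f64(a, b, c) computes a + b*c):
 */
#include <math.h>

typedef struct { double v[1]; } example_f64x1;

static example_f64x1 __example_vfma_n_f64(example_f64x1 a, example_f64x1 b,
                                          double n) {
  example_f64x1 splat = { { n } };          /* (float64x1_t) {__p2} */
  example_f64x1 r;
  r.v[0] = fma(b.v[0], splat.v[0], a.v[0]); /* a + b*n, single rounding */
  return r;
}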
@@ -47736,273 +54592,273 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vfmsd_lane_f64(__p0_88, __p1_88, __p2_88, __p3_88) __extension__ ({ \
-  float64_t __s0_88 = __p0_88; \
-  float64_t __s1_88 = __p1_88; \
-  float64x1_t __s2_88 = __p2_88; \
-  float64_t __ret_88; \
-  __ret_88 = vfmad_lane_f64(__s0_88, -__s1_88, __s2_88, __p3_88); \
-  __ret_88; \
-})
-#else
-#define vfmsd_lane_f64(__p0_89, __p1_89, __p2_89, __p3_89) __extension__ ({ \
-  float64_t __s0_89 = __p0_89; \
-  float64_t __s1_89 = __p1_89; \
-  float64x1_t __s2_89 = __p2_89; \
-  float64_t __ret_89; \
-  __ret_89 = __noswap_vfmad_lane_f64(__s0_89, -__s1_89, __s2_89, __p3_89); \
-  __ret_89; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmss_lane_f32(__p0_90, __p1_90, __p2_90, __p3_90) __extension__ ({ \
-  float32_t __s0_90 = __p0_90; \
-  float32_t __s1_90 = __p1_90; \
-  float32x2_t __s2_90 = __p2_90; \
-  float32_t __ret_90; \
-  __ret_90 = vfmas_lane_f32(__s0_90, -__s1_90, __s2_90, __p3_90); \
-  __ret_90; \
-})
-#else
-#define vfmss_lane_f32(__p0_91, __p1_91, __p2_91, __p3_91) __extension__ ({ \
-  float32_t __s0_91 = __p0_91; \
-  float32_t __s1_91 = __p1_91; \
-  float32x2_t __s2_91 = __p2_91; \
-  float32x2_t __rev2_91;  __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 1, 0); \
-  float32_t __ret_91; \
-  __ret_91 = __noswap_vfmas_lane_f32(__s0_91, -__s1_91, __rev2_91, __p3_91); \
-  __ret_91; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsq_lane_f64(__p0_92, __p1_92, __p2_92, __p3_92) __extension__ ({ \
-  float64x2_t __s0_92 = __p0_92; \
-  float64x2_t __s1_92 = __p1_92; \
-  float64x1_t __s2_92 = __p2_92; \
-  float64x2_t __ret_92; \
-  __ret_92 = vfmaq_lane_f64(__s0_92, -__s1_92, __s2_92, __p3_92); \
-  __ret_92; \
-})
-#else
-#define vfmsq_lane_f64(__p0_93, __p1_93, __p2_93, __p3_93) __extension__ ({ \
-  float64x2_t __s0_93 = __p0_93; \
-  float64x2_t __s1_93 = __p1_93; \
-  float64x1_t __s2_93 = __p2_93; \
-  float64x2_t __rev0_93;  __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 1, 0); \
-  float64x2_t __rev1_93;  __rev1_93 = __builtin_shufflevector(__s1_93, __s1_93, 1, 0); \
-  float64x2_t __ret_93; \
-  __ret_93 = __noswap_vfmaq_lane_f64(__rev0_93, -__rev1_93, __s2_93, __p3_93); \
-  __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 1, 0); \
-  __ret_93; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsq_lane_f32(__p0_94, __p1_94, __p2_94, __p3_94) __extension__ ({ \
-  float32x4_t __s0_94 = __p0_94; \
-  float32x4_t __s1_94 = __p1_94; \
-  float32x2_t __s2_94 = __p2_94; \
-  float32x4_t __ret_94; \
-  __ret_94 = vfmaq_lane_f32(__s0_94, -__s1_94, __s2_94, __p3_94); \
-  __ret_94; \
-})
-#else
-#define vfmsq_lane_f32(__p0_95, __p1_95, __p2_95, __p3_95) __extension__ ({ \
-  float32x4_t __s0_95 = __p0_95; \
-  float32x4_t __s1_95 = __p1_95; \
-  float32x2_t __s2_95 = __p2_95; \
-  float32x4_t __rev0_95;  __rev0_95 = __builtin_shufflevector(__s0_95, __s0_95, 3, 2, 1, 0); \
-  float32x4_t __rev1_95;  __rev1_95 = __builtin_shufflevector(__s1_95, __s1_95, 3, 2, 1, 0); \
-  float32x2_t __rev2_95;  __rev2_95 = __builtin_shufflevector(__s2_95, __s2_95, 1, 0); \
-  float32x4_t __ret_95; \
-  __ret_95 = __noswap_vfmaq_lane_f32(__rev0_95, -__rev1_95, __rev2_95, __p3_95); \
-  __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 3, 2, 1, 0); \
-  __ret_95; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfms_lane_f64(__p0_96, __p1_96, __p2_96, __p3_96) __extension__ ({ \
-  float64x1_t __s0_96 = __p0_96; \
-  float64x1_t __s1_96 = __p1_96; \
-  float64x1_t __s2_96 = __p2_96; \
-  float64x1_t __ret_96; \
-  __ret_96 = vfma_lane_f64(__s0_96, -__s1_96, __s2_96, __p3_96); \
-  __ret_96; \
-})
-#else
-#define vfms_lane_f64(__p0_97, __p1_97, __p2_97, __p3_97) __extension__ ({ \
-  float64x1_t __s0_97 = __p0_97; \
-  float64x1_t __s1_97 = __p1_97; \
-  float64x1_t __s2_97 = __p2_97; \
-  float64x1_t __ret_97; \
-  __ret_97 = __noswap_vfma_lane_f64(__s0_97, -__s1_97, __s2_97, __p3_97); \
-  __ret_97; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfms_lane_f32(__p0_98, __p1_98, __p2_98, __p3_98) __extension__ ({ \
-  float32x2_t __s0_98 = __p0_98; \
-  float32x2_t __s1_98 = __p1_98; \
-  float32x2_t __s2_98 = __p2_98; \
-  float32x2_t __ret_98; \
-  __ret_98 = vfma_lane_f32(__s0_98, -__s1_98, __s2_98, __p3_98); \
-  __ret_98; \
-})
-#else
-#define vfms_lane_f32(__p0_99, __p1_99, __p2_99, __p3_99) __extension__ ({ \
-  float32x2_t __s0_99 = __p0_99; \
-  float32x2_t __s1_99 = __p1_99; \
-  float32x2_t __s2_99 = __p2_99; \
-  float32x2_t __rev0_99;  __rev0_99 = __builtin_shufflevector(__s0_99, __s0_99, 1, 0); \
-  float32x2_t __rev1_99;  __rev1_99 = __builtin_shufflevector(__s1_99, __s1_99, 1, 0); \
-  float32x2_t __rev2_99;  __rev2_99 = __builtin_shufflevector(__s2_99, __s2_99, 1, 0); \
-  float32x2_t __ret_99; \
-  __ret_99 = __noswap_vfma_lane_f32(__rev0_99, -__rev1_99, __rev2_99, __p3_99); \
-  __ret_99 = __builtin_shufflevector(__ret_99, __ret_99, 1, 0); \
-  __ret_99; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsd_laneq_f64(__p0_100, __p1_100, __p2_100, __p3_100) __extension__ ({ \
+#define vfmsd_lane_f64(__p0_100, __p1_100, __p2_100, __p3_100) __extension__ ({ \
   float64_t __s0_100 = __p0_100; \
   float64_t __s1_100 = __p1_100; \
-  float64x2_t __s2_100 = __p2_100; \
+  float64x1_t __s2_100 = __p2_100; \
   float64_t __ret_100; \
-  __ret_100 = vfmad_laneq_f64(__s0_100, -__s1_100, __s2_100, __p3_100); \
+  __ret_100 = vfmad_lane_f64(__s0_100, -__s1_100, __s2_100, __p3_100); \
   __ret_100; \
 })
 #else
-#define vfmsd_laneq_f64(__p0_101, __p1_101, __p2_101, __p3_101) __extension__ ({ \
+#define vfmsd_lane_f64(__p0_101, __p1_101, __p2_101, __p3_101) __extension__ ({ \
   float64_t __s0_101 = __p0_101; \
   float64_t __s1_101 = __p1_101; \
-  float64x2_t __s2_101 = __p2_101; \
-  float64x2_t __rev2_101;  __rev2_101 = __builtin_shufflevector(__s2_101, __s2_101, 1, 0); \
+  float64x1_t __s2_101 = __p2_101; \
   float64_t __ret_101; \
-  __ret_101 = __noswap_vfmad_laneq_f64(__s0_101, -__s1_101, __rev2_101, __p3_101); \
+  __ret_101 = __noswap_vfmad_lane_f64(__s0_101, -__s1_101, __s2_101, __p3_101); \
   __ret_101; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vfmss_laneq_f32(__p0_102, __p1_102, __p2_102, __p3_102) __extension__ ({ \
+#define vfmss_lane_f32(__p0_102, __p1_102, __p2_102, __p3_102) __extension__ ({ \
   float32_t __s0_102 = __p0_102; \
   float32_t __s1_102 = __p1_102; \
-  float32x4_t __s2_102 = __p2_102; \
+  float32x2_t __s2_102 = __p2_102; \
   float32_t __ret_102; \
-  __ret_102 = vfmas_laneq_f32(__s0_102, -__s1_102, __s2_102, __p3_102); \
+  __ret_102 = vfmas_lane_f32(__s0_102, -__s1_102, __s2_102, __p3_102); \
   __ret_102; \
 })
 #else
-#define vfmss_laneq_f32(__p0_103, __p1_103, __p2_103, __p3_103) __extension__ ({ \
+#define vfmss_lane_f32(__p0_103, __p1_103, __p2_103, __p3_103) __extension__ ({ \
   float32_t __s0_103 = __p0_103; \
   float32_t __s1_103 = __p1_103; \
-  float32x4_t __s2_103 = __p2_103; \
-  float32x4_t __rev2_103;  __rev2_103 = __builtin_shufflevector(__s2_103, __s2_103, 3, 2, 1, 0); \
+  float32x2_t __s2_103 = __p2_103; \
+  float32x2_t __rev2_103;  __rev2_103 = __builtin_shufflevector(__s2_103, __s2_103, 1, 0); \
   float32_t __ret_103; \
-  __ret_103 = __noswap_vfmas_laneq_f32(__s0_103, -__s1_103, __rev2_103, __p3_103); \
+  __ret_103 = __noswap_vfmas_lane_f32(__s0_103, -__s1_103, __rev2_103, __p3_103); \
   __ret_103; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vfmsq_laneq_f64(__p0_104, __p1_104, __p2_104, __p3_104) __extension__ ({ \
+#define vfmsq_lane_f64(__p0_104, __p1_104, __p2_104, __p3_104) __extension__ ({ \
   float64x2_t __s0_104 = __p0_104; \
   float64x2_t __s1_104 = __p1_104; \
-  float64x2_t __s2_104 = __p2_104; \
+  float64x1_t __s2_104 = __p2_104; \
   float64x2_t __ret_104; \
-  __ret_104 = vfmaq_laneq_f64(__s0_104, -__s1_104, __s2_104, __p3_104); \
+  __ret_104 = vfmaq_lane_f64(__s0_104, -__s1_104, __s2_104, __p3_104); \
   __ret_104; \
 })
 #else
-#define vfmsq_laneq_f64(__p0_105, __p1_105, __p2_105, __p3_105) __extension__ ({ \
+#define vfmsq_lane_f64(__p0_105, __p1_105, __p2_105, __p3_105) __extension__ ({ \
   float64x2_t __s0_105 = __p0_105; \
   float64x2_t __s1_105 = __p1_105; \
-  float64x2_t __s2_105 = __p2_105; \
+  float64x1_t __s2_105 = __p2_105; \
   float64x2_t __rev0_105;  __rev0_105 = __builtin_shufflevector(__s0_105, __s0_105, 1, 0); \
   float64x2_t __rev1_105;  __rev1_105 = __builtin_shufflevector(__s1_105, __s1_105, 1, 0); \
-  float64x2_t __rev2_105;  __rev2_105 = __builtin_shufflevector(__s2_105, __s2_105, 1, 0); \
   float64x2_t __ret_105; \
-  __ret_105 = __noswap_vfmaq_laneq_f64(__rev0_105, -__rev1_105, __rev2_105, __p3_105); \
+  __ret_105 = __noswap_vfmaq_lane_f64(__rev0_105, -__rev1_105, __s2_105, __p3_105); \
   __ret_105 = __builtin_shufflevector(__ret_105, __ret_105, 1, 0); \
   __ret_105; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vfmsq_laneq_f32(__p0_106, __p1_106, __p2_106, __p3_106) __extension__ ({ \
+#define vfmsq_lane_f32(__p0_106, __p1_106, __p2_106, __p3_106) __extension__ ({ \
   float32x4_t __s0_106 = __p0_106; \
   float32x4_t __s1_106 = __p1_106; \
-  float32x4_t __s2_106 = __p2_106; \
+  float32x2_t __s2_106 = __p2_106; \
   float32x4_t __ret_106; \
-  __ret_106 = vfmaq_laneq_f32(__s0_106, -__s1_106, __s2_106, __p3_106); \
+  __ret_106 = vfmaq_lane_f32(__s0_106, -__s1_106, __s2_106, __p3_106); \
   __ret_106; \
 })
 #else
-#define vfmsq_laneq_f32(__p0_107, __p1_107, __p2_107, __p3_107) __extension__ ({ \
+#define vfmsq_lane_f32(__p0_107, __p1_107, __p2_107, __p3_107) __extension__ ({ \
   float32x4_t __s0_107 = __p0_107; \
   float32x4_t __s1_107 = __p1_107; \
-  float32x4_t __s2_107 = __p2_107; \
+  float32x2_t __s2_107 = __p2_107; \
   float32x4_t __rev0_107;  __rev0_107 = __builtin_shufflevector(__s0_107, __s0_107, 3, 2, 1, 0); \
   float32x4_t __rev1_107;  __rev1_107 = __builtin_shufflevector(__s1_107, __s1_107, 3, 2, 1, 0); \
-  float32x4_t __rev2_107;  __rev2_107 = __builtin_shufflevector(__s2_107, __s2_107, 3, 2, 1, 0); \
+  float32x2_t __rev2_107;  __rev2_107 = __builtin_shufflevector(__s2_107, __s2_107, 1, 0); \
   float32x4_t __ret_107; \
-  __ret_107 = __noswap_vfmaq_laneq_f32(__rev0_107, -__rev1_107, __rev2_107, __p3_107); \
+  __ret_107 = __noswap_vfmaq_lane_f32(__rev0_107, -__rev1_107, __rev2_107, __p3_107); \
   __ret_107 = __builtin_shufflevector(__ret_107, __ret_107, 3, 2, 1, 0); \
   __ret_107; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vfms_laneq_f64(__p0_108, __p1_108, __p2_108, __p3_108) __extension__ ({ \
+#define vfms_lane_f64(__p0_108, __p1_108, __p2_108, __p3_108) __extension__ ({ \
   float64x1_t __s0_108 = __p0_108; \
   float64x1_t __s1_108 = __p1_108; \
-  float64x2_t __s2_108 = __p2_108; \
+  float64x1_t __s2_108 = __p2_108; \
   float64x1_t __ret_108; \
-  __ret_108 = vfma_laneq_f64(__s0_108, -__s1_108, __s2_108, __p3_108); \
+  __ret_108 = vfma_lane_f64(__s0_108, -__s1_108, __s2_108, __p3_108); \
   __ret_108; \
 })
 #else
-#define vfms_laneq_f64(__p0_109, __p1_109, __p2_109, __p3_109) __extension__ ({ \
+#define vfms_lane_f64(__p0_109, __p1_109, __p2_109, __p3_109) __extension__ ({ \
   float64x1_t __s0_109 = __p0_109; \
   float64x1_t __s1_109 = __p1_109; \
-  float64x2_t __s2_109 = __p2_109; \
-  float64x2_t __rev2_109;  __rev2_109 = __builtin_shufflevector(__s2_109, __s2_109, 1, 0); \
+  float64x1_t __s2_109 = __p2_109; \
   float64x1_t __ret_109; \
-  __ret_109 = __noswap_vfma_laneq_f64(__s0_109, -__s1_109, __rev2_109, __p3_109); \
+  __ret_109 = __noswap_vfma_lane_f64(__s0_109, -__s1_109, __s2_109, __p3_109); \
   __ret_109; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vfms_laneq_f32(__p0_110, __p1_110, __p2_110, __p3_110) __extension__ ({ \
+#define vfms_lane_f32(__p0_110, __p1_110, __p2_110, __p3_110) __extension__ ({ \
   float32x2_t __s0_110 = __p0_110; \
   float32x2_t __s1_110 = __p1_110; \
-  float32x4_t __s2_110 = __p2_110; \
+  float32x2_t __s2_110 = __p2_110; \
   float32x2_t __ret_110; \
-  __ret_110 = vfma_laneq_f32(__s0_110, -__s1_110, __s2_110, __p3_110); \
+  __ret_110 = vfma_lane_f32(__s0_110, -__s1_110, __s2_110, __p3_110); \
   __ret_110; \
 })
 #else
-#define vfms_laneq_f32(__p0_111, __p1_111, __p2_111, __p3_111) __extension__ ({ \
+#define vfms_lane_f32(__p0_111, __p1_111, __p2_111, __p3_111) __extension__ ({ \
   float32x2_t __s0_111 = __p0_111; \
   float32x2_t __s1_111 = __p1_111; \
-  float32x4_t __s2_111 = __p2_111; \
+  float32x2_t __s2_111 = __p2_111; \
   float32x2_t __rev0_111;  __rev0_111 = __builtin_shufflevector(__s0_111, __s0_111, 1, 0); \
   float32x2_t __rev1_111;  __rev1_111 = __builtin_shufflevector(__s1_111, __s1_111, 1, 0); \
-  float32x4_t __rev2_111;  __rev2_111 = __builtin_shufflevector(__s2_111, __s2_111, 3, 2, 1, 0); \
+  float32x2_t __rev2_111;  __rev2_111 = __builtin_shufflevector(__s2_111, __s2_111, 1, 0); \
   float32x2_t __ret_111; \
-  __ret_111 = __noswap_vfma_laneq_f32(__rev0_111, -__rev1_111, __rev2_111, __p3_111); \
+  __ret_111 = __noswap_vfma_lane_f32(__rev0_111, -__rev1_111, __rev2_111, __p3_111); \
   __ret_111 = __builtin_shufflevector(__ret_111, __ret_111, 1, 0); \
   __ret_111; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
+#define vfmsd_laneq_f64(__p0_112, __p1_112, __p2_112, __p3_112) __extension__ ({ \
+  float64_t __s0_112 = __p0_112; \
+  float64_t __s1_112 = __p1_112; \
+  float64x2_t __s2_112 = __p2_112; \
+  float64_t __ret_112; \
+  __ret_112 = vfmad_laneq_f64(__s0_112, -__s1_112, __s2_112, __p3_112); \
+  __ret_112; \
+})
+#else
+#define vfmsd_laneq_f64(__p0_113, __p1_113, __p2_113, __p3_113) __extension__ ({ \
+  float64_t __s0_113 = __p0_113; \
+  float64_t __s1_113 = __p1_113; \
+  float64x2_t __s2_113 = __p2_113; \
+  float64x2_t __rev2_113;  __rev2_113 = __builtin_shufflevector(__s2_113, __s2_113, 1, 0); \
+  float64_t __ret_113; \
+  __ret_113 = __noswap_vfmad_laneq_f64(__s0_113, -__s1_113, __rev2_113, __p3_113); \
+  __ret_113; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmss_laneq_f32(__p0_114, __p1_114, __p2_114, __p3_114) __extension__ ({ \
+  float32_t __s0_114 = __p0_114; \
+  float32_t __s1_114 = __p1_114; \
+  float32x4_t __s2_114 = __p2_114; \
+  float32_t __ret_114; \
+  __ret_114 = vfmas_laneq_f32(__s0_114, -__s1_114, __s2_114, __p3_114); \
+  __ret_114; \
+})
+#else
+#define vfmss_laneq_f32(__p0_115, __p1_115, __p2_115, __p3_115) __extension__ ({ \
+  float32_t __s0_115 = __p0_115; \
+  float32_t __s1_115 = __p1_115; \
+  float32x4_t __s2_115 = __p2_115; \
+  float32x4_t __rev2_115;  __rev2_115 = __builtin_shufflevector(__s2_115, __s2_115, 3, 2, 1, 0); \
+  float32_t __ret_115; \
+  __ret_115 = __noswap_vfmas_laneq_f32(__s0_115, -__s1_115, __rev2_115, __p3_115); \
+  __ret_115; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_laneq_f64(__p0_116, __p1_116, __p2_116, __p3_116) __extension__ ({ \
+  float64x2_t __s0_116 = __p0_116; \
+  float64x2_t __s1_116 = __p1_116; \
+  float64x2_t __s2_116 = __p2_116; \
+  float64x2_t __ret_116; \
+  __ret_116 = vfmaq_laneq_f64(__s0_116, -__s1_116, __s2_116, __p3_116); \
+  __ret_116; \
+})
+#else
+#define vfmsq_laneq_f64(__p0_117, __p1_117, __p2_117, __p3_117) __extension__ ({ \
+  float64x2_t __s0_117 = __p0_117; \
+  float64x2_t __s1_117 = __p1_117; \
+  float64x2_t __s2_117 = __p2_117; \
+  float64x2_t __rev0_117;  __rev0_117 = __builtin_shufflevector(__s0_117, __s0_117, 1, 0); \
+  float64x2_t __rev1_117;  __rev1_117 = __builtin_shufflevector(__s1_117, __s1_117, 1, 0); \
+  float64x2_t __rev2_117;  __rev2_117 = __builtin_shufflevector(__s2_117, __s2_117, 1, 0); \
+  float64x2_t __ret_117; \
+  __ret_117 = __noswap_vfmaq_laneq_f64(__rev0_117, -__rev1_117, __rev2_117, __p3_117); \
+  __ret_117 = __builtin_shufflevector(__ret_117, __ret_117, 1, 0); \
+  __ret_117; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_laneq_f32(__p0_118, __p1_118, __p2_118, __p3_118) __extension__ ({ \
+  float32x4_t __s0_118 = __p0_118; \
+  float32x4_t __s1_118 = __p1_118; \
+  float32x4_t __s2_118 = __p2_118; \
+  float32x4_t __ret_118; \
+  __ret_118 = vfmaq_laneq_f32(__s0_118, -__s1_118, __s2_118, __p3_118); \
+  __ret_118; \
+})
+#else
+#define vfmsq_laneq_f32(__p0_119, __p1_119, __p2_119, __p3_119) __extension__ ({ \
+  float32x4_t __s0_119 = __p0_119; \
+  float32x4_t __s1_119 = __p1_119; \
+  float32x4_t __s2_119 = __p2_119; \
+  float32x4_t __rev0_119;  __rev0_119 = __builtin_shufflevector(__s0_119, __s0_119, 3, 2, 1, 0); \
+  float32x4_t __rev1_119;  __rev1_119 = __builtin_shufflevector(__s1_119, __s1_119, 3, 2, 1, 0); \
+  float32x4_t __rev2_119;  __rev2_119 = __builtin_shufflevector(__s2_119, __s2_119, 3, 2, 1, 0); \
+  float32x4_t __ret_119; \
+  __ret_119 = __noswap_vfmaq_laneq_f32(__rev0_119, -__rev1_119, __rev2_119, __p3_119); \
+  __ret_119 = __builtin_shufflevector(__ret_119, __ret_119, 3, 2, 1, 0); \
+  __ret_119; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_laneq_f64(__p0_120, __p1_120, __p2_120, __p3_120) __extension__ ({ \
+  float64x1_t __s0_120 = __p0_120; \
+  float64x1_t __s1_120 = __p1_120; \
+  float64x2_t __s2_120 = __p2_120; \
+  float64x1_t __ret_120; \
+  __ret_120 = vfma_laneq_f64(__s0_120, -__s1_120, __s2_120, __p3_120); \
+  __ret_120; \
+})
+#else
+#define vfms_laneq_f64(__p0_121, __p1_121, __p2_121, __p3_121) __extension__ ({ \
+  float64x1_t __s0_121 = __p0_121; \
+  float64x1_t __s1_121 = __p1_121; \
+  float64x2_t __s2_121 = __p2_121; \
+  float64x2_t __rev2_121;  __rev2_121 = __builtin_shufflevector(__s2_121, __s2_121, 1, 0); \
+  float64x1_t __ret_121; \
+  __ret_121 = __noswap_vfma_laneq_f64(__s0_121, -__s1_121, __rev2_121, __p3_121); \
+  __ret_121; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_laneq_f32(__p0_122, __p1_122, __p2_122, __p3_122) __extension__ ({ \
+  float32x2_t __s0_122 = __p0_122; \
+  float32x2_t __s1_122 = __p1_122; \
+  float32x4_t __s2_122 = __p2_122; \
+  float32x2_t __ret_122; \
+  __ret_122 = vfma_laneq_f32(__s0_122, -__s1_122, __s2_122, __p3_122); \
+  __ret_122; \
+})
+#else
+#define vfms_laneq_f32(__p0_123, __p1_123, __p2_123, __p3_123) __extension__ ({ \
+  float32x2_t __s0_123 = __p0_123; \
+  float32x2_t __s1_123 = __p1_123; \
+  float32x4_t __s2_123 = __p2_123; \
+  float32x2_t __rev0_123;  __rev0_123 = __builtin_shufflevector(__s0_123, __s0_123, 1, 0); \
+  float32x2_t __rev1_123;  __rev1_123 = __builtin_shufflevector(__s1_123, __s1_123, 1, 0); \
+  float32x4_t __rev2_123;  __rev2_123 = __builtin_shufflevector(__s2_123, __s2_123, 3, 2, 1, 0); \
+  float32x2_t __ret_123; \
+  __ret_123 = __noswap_vfma_laneq_f32(__rev0_123, -__rev1_123, __rev2_123, __p3_123); \
+  __ret_123 = __builtin_shufflevector(__ret_123, __ret_123, 1, 0); \
+  __ret_123; \
+})
+#endif
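
Note: the hunks above fix the vfms*_lane* macros, which were previously
defined with the types and lane ranges of the *_laneq forms, and the
hunks ending here add the proper vfms*_laneq* definitions. A minimal
usage sketch of the corrected pair (not part of the patch; assumes an
AArch64 target where <arm_neon.h> provides these intrinsics):

  #include <arm_neon.h>

  float32x4_t fms_examples(float32x4_t acc, float32x4_t x,
                           float32x2_t v64, float32x4_t v128) {
    /* acc - x * v64[1]; the _lane form indexes a 64-bit vector, so
       the lane index must be in [0, 1]. */
    float32x4_t r0 = vfmsq_lane_f32(acc, x, v64, 1);
    /* acc - x * v128[3]; the _laneq form indexes a 128-bit vector,
       so the lane index may be in [0, 3]. */
    float32x4_t r1 = vfmsq_laneq_f32(r0, x, v128, 3);
    return r1;
  }
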
+
+#ifdef __LITTLE_ENDIAN__
 __ai float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
   float64x2_t __ret;
   __ret = vfmaq_f64(__p0, -__p1, (float64x2_t) {__p2, __p2});
@@ -48037,6 +54893,20 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vfms_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) {
+  float64x1_t __ret;
+  __ret = vfma_f64(__p0, -__p1, (float64x1_t) {__p2});
+  return __ret;
+}
+#else
+__ai float64x1_t vfms_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) {
+  float64x1_t __ret;
+  __ret = __noswap_vfma_f64(__p0, -__p1, (float64x1_t) {__p2});
+  return __ret;
+}
+#endif
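
The newly added vfms_n_f64 broadcasts a scalar multiplier into a
one-lane vector and computes the fused a - b * n; since a float64x1_t
has a single lane, the big-endian branch needs no lane reversal. A
short sketch (hypothetical example, assumes an AArch64 target):

  #include <arm_neon.h>

  /* Computes a - b * n on a single float64 lane. */
  float64x1_t fms_scalar(float64x1_t a, float64x1_t b, float64_t n) {
    return vfms_n_f64(a, b, n);
  }
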
+
+#ifdef __LITTLE_ENDIAN__
 __ai float32x2_t vfms_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
   float32x2_t __ret;
   __ret = vfma_f32(__p0, -__p1, (float32x2_t) {__p2, __p2});
@@ -48393,23 +55263,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1_p8_x2(__p0) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld1_p8_x2(__p0) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1_p64_x2(__p0) __extension__ ({ \
   poly64x1x2_t __ret; \
   __builtin_neon_vld1_x2_v(&__ret, __p0, 6); \
@@ -48424,40 +55277,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1_p16_x2(__p0) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld1_p16_x2(__p0) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p8_x2(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld1q_p8_x2(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1q_p64_x2(__p0) __extension__ ({ \
   poly64x2x2_t __ret; \
   __builtin_neon_vld1q_x2_v(&__ret, __p0, 38); \
@@ -48475,108 +55294,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1q_p16_x2(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld1q_p16_x2(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u8_x2(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld1q_u8_x2(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u32_x2(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld1q_u32_x2(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u64_x2(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld1q_u64_x2(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u16_x2(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld1q_u16_x2(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s8_x2(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld1q_s8_x2(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1q_f64_x2(__p0) __extension__ ({ \
   float64x2x2_t __ret; \
   __builtin_neon_vld1q_x2_v(&__ret, __p0, 42); \
@@ -48594,173 +55311,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1q_f32_x2(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld1q_f32_x2(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f16_x2(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld1q_f16_x2(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s32_x2(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld1q_s32_x2(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s64_x2(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld1q_s64_x2(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s16_x2(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld1q_s16_x2(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u8_x2(__p0) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld1_u8_x2(__p0) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u32_x2(__p0) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld1_u32_x2(__p0) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u64_x2(__p0) __extension__ ({ \
-  uint64x1x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#else
-#define vld1_u64_x2(__p0) __extension__ ({ \
-  uint64x1x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u16_x2(__p0) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld1_u16_x2(__p0) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s8_x2(__p0) __extension__ ({ \
-  int8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld1_s8_x2(__p0) __extension__ ({ \
-  int8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1_f64_x2(__p0) __extension__ ({ \
   float64x1x2_t __ret; \
   __builtin_neon_vld1_x2_v(&__ret, __p0, 10); \
@@ -48775,106 +55325,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1_f32_x2(__p0) __extension__ ({ \
-  float32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld1_f32_x2(__p0) __extension__ ({ \
-  float32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f16_x2(__p0) __extension__ ({ \
-  float16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld1_f16_x2(__p0) __extension__ ({ \
-  float16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s32_x2(__p0) __extension__ ({ \
-  int32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld1_s32_x2(__p0) __extension__ ({ \
-  int32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s64_x2(__p0) __extension__ ({ \
-  int64x1x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#else
-#define vld1_s64_x2(__p0) __extension__ ({ \
-  int64x1x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s16_x2(__p0) __extension__ ({ \
-  int16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld1_s16_x2(__p0) __extension__ ({ \
-  int16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
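
Note: the vld1_*_x2 definitions above (and the _x3/_x4 definitions in
the hunks that follow) are only removed from this AArch64-specific
region of the generated header, presumably because they are now emitted
in a section shared by both architectures; the diff here shows only the
removals. For reference, a usage sketch of one of the affected
intrinsics (hypothetical example, not from the patch):

  #include <arm_neon.h>

  /* vld1q_f32_x2 loads two consecutive float32x4_t vectors (8 floats)
     in one call; the big-endian branch of the macro reverses each
     vector's lanes after the builtin load. */
  float32x4_t sum_pairs(const float *p) {
    float32x4x2_t v = vld1q_f32_x2(p);
    return vaddq_f32(v.val[0], v.val[1]);
  }
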
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p8_x3(__p0) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld1_p8_x3(__p0) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1_p64_x3(__p0) __extension__ ({ \
   poly64x1x3_t __ret; \
   __builtin_neon_vld1_x3_v(&__ret, __p0, 6); \
@@ -48889,42 +55339,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1_p16_x3(__p0) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld1_p16_x3(__p0) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p8_x3(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld1q_p8_x3(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1q_p64_x3(__p0) __extension__ ({ \
   poly64x2x3_t __ret; \
   __builtin_neon_vld1q_x3_v(&__ret, __p0, 38); \
@@ -48943,114 +55357,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1q_p16_x3(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld1q_p16_x3(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u8_x3(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld1q_u8_x3(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u32_x3(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld1q_u32_x3(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u64_x3(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld1q_u64_x3(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u16_x3(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld1q_u16_x3(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s8_x3(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld1q_s8_x3(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1q_f64_x3(__p0) __extension__ ({ \
   float64x2x3_t __ret; \
   __builtin_neon_vld1q_x3_v(&__ret, __p0, 42); \
@@ -49069,182 +55375,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1q_f32_x3(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld1q_f32_x3(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f16_x3(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld1q_f16_x3(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s32_x3(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld1q_s32_x3(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s64_x3(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld1q_s64_x3(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s16_x3(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld1q_s16_x3(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u8_x3(__p0) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld1_u8_x3(__p0) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u32_x3(__p0) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld1_u32_x3(__p0) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u64_x3(__p0) __extension__ ({ \
-  uint64x1x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#else
-#define vld1_u64_x3(__p0) __extension__ ({ \
-  uint64x1x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u16_x3(__p0) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld1_u16_x3(__p0) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s8_x3(__p0) __extension__ ({ \
-  int8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld1_s8_x3(__p0) __extension__ ({ \
-  int8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1_f64_x3(__p0) __extension__ ({ \
   float64x1x3_t __ret; \
   __builtin_neon_vld1_x3_v(&__ret, __p0, 10); \
@@ -49259,111 +55389,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1_f32_x3(__p0) __extension__ ({ \
-  float32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld1_f32_x3(__p0) __extension__ ({ \
-  float32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f16_x3(__p0) __extension__ ({ \
-  float16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld1_f16_x3(__p0) __extension__ ({ \
-  float16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s32_x3(__p0) __extension__ ({ \
-  int32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld1_s32_x3(__p0) __extension__ ({ \
-  int32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s64_x3(__p0) __extension__ ({ \
-  int64x1x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#else
-#define vld1_s64_x3(__p0) __extension__ ({ \
-  int64x1x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s16_x3(__p0) __extension__ ({ \
-  int16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld1_s16_x3(__p0) __extension__ ({ \
-  int16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p8_x4(__p0) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld1_p8_x4(__p0) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1_p64_x4(__p0) __extension__ ({ \
   poly64x1x4_t __ret; \
   __builtin_neon_vld1_x4_v(&__ret, __p0, 6); \
@@ -49378,44 +55403,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1_p16_x4(__p0) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld1_p16_x4(__p0) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p8_x4(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld1q_p8_x4(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1q_p64_x4(__p0) __extension__ ({ \
   poly64x2x4_t __ret; \
   __builtin_neon_vld1q_x4_v(&__ret, __p0, 38); \
@@ -49435,120 +55422,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1q_p16_x4(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld1q_p16_x4(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u8_x4(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld1q_u8_x4(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u32_x4(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld1q_u32_x4(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u64_x4(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld1q_u64_x4(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u16_x4(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld1q_u16_x4(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s8_x4(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld1q_s8_x4(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1q_f64_x4(__p0) __extension__ ({ \
   float64x2x4_t __ret; \
   __builtin_neon_vld1q_x4_v(&__ret, __p0, 42); \
@@ -49568,191 +55441,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1q_f32_x4(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld1q_f32_x4(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f16_x4(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld1q_f16_x4(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s32_x4(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld1q_s32_x4(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s64_x4(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld1q_s64_x4(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s16_x4(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld1q_s16_x4(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u8_x4(__p0) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld1_u8_x4(__p0) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u32_x4(__p0) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld1_u32_x4(__p0) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u64_x4(__p0) __extension__ ({ \
-  uint64x1x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#else
-#define vld1_u64_x4(__p0) __extension__ ({ \
-  uint64x1x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u16_x4(__p0) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld1_u16_x4(__p0) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s8_x4(__p0) __extension__ ({ \
-  int8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld1_s8_x4(__p0) __extension__ ({ \
-  int8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld1_f64_x4(__p0) __extension__ ({ \
   float64x1x4_t __ret; \
   __builtin_neon_vld1_x4_v(&__ret, __p0, 10); \
@@ -49767,96 +55455,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld1_f32_x4(__p0) __extension__ ({ \
-  float32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld1_f32_x4(__p0) __extension__ ({ \
-  float32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f16_x4(__p0) __extension__ ({ \
-  float16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld1_f16_x4(__p0) __extension__ ({ \
-  float16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s32_x4(__p0) __extension__ ({ \
-  int32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld1_s32_x4(__p0) __extension__ ({ \
-  int32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s64_x4(__p0) __extension__ ({ \
-  int64x1x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#else
-#define vld1_s64_x4(__p0) __extension__ ({ \
-  int64x1x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s16_x4(__p0) __extension__ ({ \
-  int16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld1_s16_x4(__p0) __extension__ ({ \
-  int16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld2_p64(__p0) __extension__ ({ \
   poly64x1x2_t __ret; \
   __builtin_neon_vld2_v(&__ret, __p0, 6); \
@@ -49967,23 +55565,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld2q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld2q_dup_p64(__p0) __extension__ ({ \
   poly64x2x2_t __ret; \
   __builtin_neon_vld2q_dup_v(&__ret, __p0, 38); \
@@ -50001,108 +55582,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld2q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld2q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld2q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld2q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld2q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_s8(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld2q_dup_s8(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld2q_dup_f64(__p0) __extension__ ({ \
   float64x2x2_t __ret; \
   __builtin_neon_vld2q_dup_v(&__ret, __p0, 42); \
@@ -50120,91 +55599,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_f32(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld2q_dup_f32(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_f16(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld2q_dup_f16(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_s32(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld2q_dup_s32(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_s64(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld2q_dup_s64(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_s16(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld2q_dup_s16(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld2_dup_f64(__p0) __extension__ ({ \
   float64x1x2_t __ret; \
   __builtin_neon_vld2_dup_v(&__ret, __p0, 10); \
@@ -50551,24 +55945,6 @@
 #endif
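// The vld3q_dup_<type> forms below follow the same scheme with three
// vectors: a single three-element structure is loaded and element k
// is broadcast into every lane of val[k].  The descending index list
// passed to __builtin_shufflevector is what performs the big-endian
// lane reversal; with the same vector supplied for both operands it
// is a pure reverse.  A minimal sketch of that building block:
//
//   static inline int32x4_t reverse_lanes_s32(int32x4_t v) {
//     return __builtin_shufflevector(v, v, 3, 2, 1, 0);
//   }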
 
 #ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld3q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld3q_dup_p64(__p0) __extension__ ({ \
   poly64x2x3_t __ret; \
   __builtin_neon_vld3q_dup_v(&__ret, __p0, 38); \
@@ -50587,114 +55963,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld3q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld3q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld3q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld3q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld3q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_s8(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld3q_dup_s8(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld3q_dup_f64(__p0) __extension__ ({ \
   float64x2x3_t __ret; \
   __builtin_neon_vld3q_dup_v(&__ret, __p0, 42); \
@@ -50713,96 +55981,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_f32(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld3q_dup_f32(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_f16(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld3q_dup_f16(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_s32(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld3q_dup_s32(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_s64(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld3q_dup_s64(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_s16(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld3q_dup_s16(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld3_dup_f64(__p0) __extension__ ({ \
   float64x1x3_t __ret; \
   __builtin_neon_vld3_dup_v(&__ret, __p0, 10); \
@@ -51167,25 +56345,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld4q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld4q_dup_p64(__p0) __extension__ ({ \
   poly64x2x4_t __ret; \
   __builtin_neon_vld4q_dup_v(&__ret, __p0, 38); \
@@ -51205,120 +56364,6 @@
 #endif
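// vld4q_dup_<type> extends the same pattern to four vectors: one
// four-element structure is loaded and element k fills every lane of
// val[k], with all four vectors lane-reversed on big-endian targets.
// Sketch with a hypothetical RGBA byte buffer:
//
//   uint8_t rgba[4] = {10, 20, 30, 40};
//   uint8x16x4_t v = vld4q_dup_u8(rgba);
//   // v.val[0] == sixteen copies of 10, ..., v.val[3] == sixteen
//   // copies of 40.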
 
 #ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld4q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld4q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld4q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld4q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld4q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_s8(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld4q_dup_s8(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld4q_dup_f64(__p0) __extension__ ({ \
   float64x2x4_t __ret; \
   __builtin_neon_vld4q_dup_v(&__ret, __p0, 42); \
@@ -51338,101 +56383,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_f32(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld4q_dup_f32(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_f16(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld4q_dup_f16(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_s32(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld4q_dup_s32(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_s64(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld4q_dup_s64(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_s16(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld4q_dup_s16(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vld4_dup_f64(__p0) __extension__ ({ \
   float64x1x4_t __ret; \
   __builtin_neon_vld4_dup_v(&__ret, __p0, 10); \
@@ -53521,150 +58471,150 @@
 #endif
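// The hunk below is a pure renumbering: the _NNN suffixes on __p0,
// __rev0, __a1 and __ret are uniquing counters assigned by the header
// generator, and intrinsics added earlier in the regenerated file
// shift every later counter (here _112.._129 become _124.._141); the
// logic of vmovl_high_<type> itself is unchanged.  It widens the high
// half of a vector, using a shift-left-long by 0 as a plain widening
// move.  Hypothetical example:
//
//   uint8x16_t b = vdupq_n_u8(200);
//   uint16x8_t w = vmovl_high_u8(b);  // eight 16-bit lanes of 200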
 
 #ifdef __LITTLE_ENDIAN__
-__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_112) {
-  uint16x8_t __ret_112;
-  uint8x8_t __a1_112 = vget_high_u8(__p0_112);
-  __ret_112 = (uint16x8_t)(vshll_n_u8(__a1_112, 0));
-  return __ret_112;
-}
-#else
-__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_113) {
-  uint8x16_t __rev0_113;  __rev0_113 = __builtin_shufflevector(__p0_113, __p0_113, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __ret_113;
-  uint8x8_t __a1_113 = __noswap_vget_high_u8(__rev0_113);
-  __ret_113 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_113, 0));
-  __ret_113 = __builtin_shufflevector(__ret_113, __ret_113, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret_113;
-}
-__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_114) {
-  uint16x8_t __ret_114;
-  uint8x8_t __a1_114 = __noswap_vget_high_u8(__p0_114);
-  __ret_114 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_114, 0));
-  return __ret_114;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_115) {
-  uint64x2_t __ret_115;
-  uint32x2_t __a1_115 = vget_high_u32(__p0_115);
-  __ret_115 = (uint64x2_t)(vshll_n_u32(__a1_115, 0));
-  return __ret_115;
-}
-#else
-__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_116) {
-  uint32x4_t __rev0_116;  __rev0_116 = __builtin_shufflevector(__p0_116, __p0_116, 3, 2, 1, 0);
-  uint64x2_t __ret_116;
-  uint32x2_t __a1_116 = __noswap_vget_high_u32(__rev0_116);
-  __ret_116 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_116, 0));
-  __ret_116 = __builtin_shufflevector(__ret_116, __ret_116, 1, 0);
-  return __ret_116;
-}
-__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_117) {
-  uint64x2_t __ret_117;
-  uint32x2_t __a1_117 = __noswap_vget_high_u32(__p0_117);
-  __ret_117 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_117, 0));
-  return __ret_117;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_118) {
-  uint32x4_t __ret_118;
-  uint16x4_t __a1_118 = vget_high_u16(__p0_118);
-  __ret_118 = (uint32x4_t)(vshll_n_u16(__a1_118, 0));
-  return __ret_118;
-}
-#else
-__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_119) {
-  uint16x8_t __rev0_119;  __rev0_119 = __builtin_shufflevector(__p0_119, __p0_119, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint32x4_t __ret_119;
-  uint16x4_t __a1_119 = __noswap_vget_high_u16(__rev0_119);
-  __ret_119 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_119, 0));
-  __ret_119 = __builtin_shufflevector(__ret_119, __ret_119, 3, 2, 1, 0);
-  return __ret_119;
-}
-__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_120) {
-  uint32x4_t __ret_120;
-  uint16x4_t __a1_120 = __noswap_vget_high_u16(__p0_120);
-  __ret_120 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_120, 0));
-  return __ret_120;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai int16x8_t vmovl_high_s8(int8x16_t __p0_121) {
-  int16x8_t __ret_121;
-  int8x8_t __a1_121 = vget_high_s8(__p0_121);
-  __ret_121 = (int16x8_t)(vshll_n_s8(__a1_121, 0));
-  return __ret_121;
-}
-#else
-__ai int16x8_t vmovl_high_s8(int8x16_t __p0_122) {
-  int8x16_t __rev0_122;  __rev0_122 = __builtin_shufflevector(__p0_122, __p0_122, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __ret_122;
-  int8x8_t __a1_122 = __noswap_vget_high_s8(__rev0_122);
-  __ret_122 = (int16x8_t)(__noswap_vshll_n_s8(__a1_122, 0));
-  __ret_122 = __builtin_shufflevector(__ret_122, __ret_122, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret_122;
-}
-__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_123) {
-  int16x8_t __ret_123;
-  int8x8_t __a1_123 = __noswap_vget_high_s8(__p0_123);
-  __ret_123 = (int16x8_t)(__noswap_vshll_n_s8(__a1_123, 0));
-  return __ret_123;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai int64x2_t vmovl_high_s32(int32x4_t __p0_124) {
-  int64x2_t __ret_124;
-  int32x2_t __a1_124 = vget_high_s32(__p0_124);
-  __ret_124 = (int64x2_t)(vshll_n_s32(__a1_124, 0));
+__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_124) {
+  uint16x8_t __ret_124;
+  uint8x8_t __a1_124 = vget_high_u8(__p0_124);
+  __ret_124 = (uint16x8_t)(vshll_n_u8(__a1_124, 0));
   return __ret_124;
 }
 #else
-__ai int64x2_t vmovl_high_s32(int32x4_t __p0_125) {
-  int32x4_t __rev0_125;  __rev0_125 = __builtin_shufflevector(__p0_125, __p0_125, 3, 2, 1, 0);
-  int64x2_t __ret_125;
-  int32x2_t __a1_125 = __noswap_vget_high_s32(__rev0_125);
-  __ret_125 = (int64x2_t)(__noswap_vshll_n_s32(__a1_125, 0));
-  __ret_125 = __builtin_shufflevector(__ret_125, __ret_125, 1, 0);
+__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_125) {
+  uint8x16_t __rev0_125;  __rev0_125 = __builtin_shufflevector(__p0_125, __p0_125, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret_125;
+  uint8x8_t __a1_125 = __noswap_vget_high_u8(__rev0_125);
+  __ret_125 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_125, 0));
+  __ret_125 = __builtin_shufflevector(__ret_125, __ret_125, 7, 6, 5, 4, 3, 2, 1, 0);
   return __ret_125;
 }
-__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_126) {
-  int64x2_t __ret_126;
-  int32x2_t __a1_126 = __noswap_vget_high_s32(__p0_126);
-  __ret_126 = (int64x2_t)(__noswap_vshll_n_s32(__a1_126, 0));
+__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_126) {
+  uint16x8_t __ret_126;
+  uint8x8_t __a1_126 = __noswap_vget_high_u8(__p0_126);
+  __ret_126 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_126, 0));
   return __ret_126;
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-__ai int32x4_t vmovl_high_s16(int16x8_t __p0_127) {
-  int32x4_t __ret_127;
-  int16x4_t __a1_127 = vget_high_s16(__p0_127);
-  __ret_127 = (int32x4_t)(vshll_n_s16(__a1_127, 0));
+__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_127) {
+  uint64x2_t __ret_127;
+  uint32x2_t __a1_127 = vget_high_u32(__p0_127);
+  __ret_127 = (uint64x2_t)(vshll_n_u32(__a1_127, 0));
   return __ret_127;
 }
 #else
-__ai int32x4_t vmovl_high_s16(int16x8_t __p0_128) {
-  int16x8_t __rev0_128;  __rev0_128 = __builtin_shufflevector(__p0_128, __p0_128, 7, 6, 5, 4, 3, 2, 1, 0);
-  int32x4_t __ret_128;
-  int16x4_t __a1_128 = __noswap_vget_high_s16(__rev0_128);
-  __ret_128 = (int32x4_t)(__noswap_vshll_n_s16(__a1_128, 0));
-  __ret_128 = __builtin_shufflevector(__ret_128, __ret_128, 3, 2, 1, 0);
+__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_128) {
+  uint32x4_t __rev0_128;  __rev0_128 = __builtin_shufflevector(__p0_128, __p0_128, 3, 2, 1, 0);
+  uint64x2_t __ret_128;
+  uint32x2_t __a1_128 = __noswap_vget_high_u32(__rev0_128);
+  __ret_128 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_128, 0));
+  __ret_128 = __builtin_shufflevector(__ret_128, __ret_128, 1, 0);
   return __ret_128;
 }
-__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_129) {
-  int32x4_t __ret_129;
-  int16x4_t __a1_129 = __noswap_vget_high_s16(__p0_129);
-  __ret_129 = (int32x4_t)(__noswap_vshll_n_s16(__a1_129, 0));
+__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_129) {
+  uint64x2_t __ret_129;
+  uint32x2_t __a1_129 = __noswap_vget_high_u32(__p0_129);
+  __ret_129 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_129, 0));
   return __ret_129;
 }
 #endif
 
 #ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_130) {
+  uint32x4_t __ret_130;
+  uint16x4_t __a1_130 = vget_high_u16(__p0_130);
+  __ret_130 = (uint32x4_t)(vshll_n_u16(__a1_130, 0));
+  return __ret_130;
+}
+#else
+__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_131) {
+  uint16x8_t __rev0_131;  __rev0_131 = __builtin_shufflevector(__p0_131, __p0_131, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret_131;
+  uint16x4_t __a1_131 = __noswap_vget_high_u16(__rev0_131);
+  __ret_131 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_131, 0));
+  __ret_131 = __builtin_shufflevector(__ret_131, __ret_131, 3, 2, 1, 0);
+  return __ret_131;
+}
+__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_132) {
+  uint32x4_t __ret_132;
+  uint16x4_t __a1_132 = __noswap_vget_high_u16(__p0_132);
+  __ret_132 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_132, 0));
+  return __ret_132;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmovl_high_s8(int8x16_t __p0_133) {
+  int16x8_t __ret_133;
+  int8x8_t __a1_133 = vget_high_s8(__p0_133);
+  __ret_133 = (int16x8_t)(vshll_n_s8(__a1_133, 0));
+  return __ret_133;
+}
+#else
+__ai int16x8_t vmovl_high_s8(int8x16_t __p0_134) {
+  int8x16_t __rev0_134;  __rev0_134 = __builtin_shufflevector(__p0_134, __p0_134, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret_134;
+  int8x8_t __a1_134 = __noswap_vget_high_s8(__rev0_134);
+  __ret_134 = (int16x8_t)(__noswap_vshll_n_s8(__a1_134, 0));
+  __ret_134 = __builtin_shufflevector(__ret_134, __ret_134, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret_134;
+}
+__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_135) {
+  int16x8_t __ret_135;
+  int8x8_t __a1_135 = __noswap_vget_high_s8(__p0_135);
+  __ret_135 = (int16x8_t)(__noswap_vshll_n_s8(__a1_135, 0));
+  return __ret_135;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmovl_high_s32(int32x4_t __p0_136) {
+  int64x2_t __ret_136;
+  int32x2_t __a1_136 = vget_high_s32(__p0_136);
+  __ret_136 = (int64x2_t)(vshll_n_s32(__a1_136, 0));
+  return __ret_136;
+}
+#else
+__ai int64x2_t vmovl_high_s32(int32x4_t __p0_137) {
+  int32x4_t __rev0_137;  __rev0_137 = __builtin_shufflevector(__p0_137, __p0_137, 3, 2, 1, 0);
+  int64x2_t __ret_137;
+  int32x2_t __a1_137 = __noswap_vget_high_s32(__rev0_137);
+  __ret_137 = (int64x2_t)(__noswap_vshll_n_s32(__a1_137, 0));
+  __ret_137 = __builtin_shufflevector(__ret_137, __ret_137, 1, 0);
+  return __ret_137;
+}
+__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_138) {
+  int64x2_t __ret_138;
+  int32x2_t __a1_138 = __noswap_vget_high_s32(__p0_138);
+  __ret_138 = (int64x2_t)(__noswap_vshll_n_s32(__a1_138, 0));
+  return __ret_138;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmovl_high_s16(int16x8_t __p0_139) {
+  int32x4_t __ret_139;
+  int16x4_t __a1_139 = vget_high_s16(__p0_139);
+  __ret_139 = (int32x4_t)(vshll_n_s16(__a1_139, 0));
+  return __ret_139;
+}
+#else
+__ai int32x4_t vmovl_high_s16(int16x8_t __p0_140) {
+  int16x8_t __rev0_140;  __rev0_140 = __builtin_shufflevector(__p0_140, __p0_140, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret_140;
+  int16x4_t __a1_140 = __noswap_vget_high_s16(__rev0_140);
+  __ret_140 = (int32x4_t)(__noswap_vshll_n_s16(__a1_140, 0));
+  __ret_140 = __builtin_shufflevector(__ret_140, __ret_140, 3, 2, 1, 0);
+  return __ret_140;
+}
+__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_141) {
+  int32x4_t __ret_141;
+  int16x4_t __a1_141 = __noswap_vget_high_s16(__p0_141);
+  __ret_141 = (int32x4_t)(__noswap_vshll_n_s16(__a1_141, 0));
+  return __ret_141;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
 __ai uint16x8_t vmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
   uint16x8_t __ret;
   __ret = vcombine_u16(__p0, vmovn_u32(__p1));
@@ -53798,39 +58748,39 @@
 #endif
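// vmuld_lane_f64 / vmuls_lane_f32 below multiply a scalar by one lane
// of a vector.  The big-endian variants reverse the vector operand
// first so the lane index keeps its little-endian meaning; for the
// one-lane float64x1_t case there is nothing to reverse, so only the
// __noswap_ lane accessor differs.  Hypothetical use:
//
//   float32x2_t v = {2.0f, 3.0f};
//   float32_t r = vmuls_lane_f32(5.0f, v, 1);  // 5 * v[1] == 15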
 
 #ifdef __LITTLE_ENDIAN__
-#define vmuld_lane_f64(__p0_130, __p1_130, __p2_130) __extension__ ({ \
-  float64_t __s0_130 = __p0_130; \
-  float64x1_t __s1_130 = __p1_130; \
-  float64_t __ret_130; \
-  __ret_130 = __s0_130 * vget_lane_f64(__s1_130, __p2_130); \
-  __ret_130; \
+#define vmuld_lane_f64(__p0_142, __p1_142, __p2_142) __extension__ ({ \
+  float64_t __s0_142 = __p0_142; \
+  float64x1_t __s1_142 = __p1_142; \
+  float64_t __ret_142; \
+  __ret_142 = __s0_142 * vget_lane_f64(__s1_142, __p2_142); \
+  __ret_142; \
 })
 #else
-#define vmuld_lane_f64(__p0_131, __p1_131, __p2_131) __extension__ ({ \
-  float64_t __s0_131 = __p0_131; \
-  float64x1_t __s1_131 = __p1_131; \
-  float64_t __ret_131; \
-  __ret_131 = __s0_131 * __noswap_vget_lane_f64(__s1_131, __p2_131); \
-  __ret_131; \
+#define vmuld_lane_f64(__p0_143, __p1_143, __p2_143) __extension__ ({ \
+  float64_t __s0_143 = __p0_143; \
+  float64x1_t __s1_143 = __p1_143; \
+  float64_t __ret_143; \
+  __ret_143 = __s0_143 * __noswap_vget_lane_f64(__s1_143, __p2_143); \
+  __ret_143; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vmuls_lane_f32(__p0_132, __p1_132, __p2_132) __extension__ ({ \
-  float32_t __s0_132 = __p0_132; \
-  float32x2_t __s1_132 = __p1_132; \
-  float32_t __ret_132; \
-  __ret_132 = __s0_132 * vget_lane_f32(__s1_132, __p2_132); \
-  __ret_132; \
+#define vmuls_lane_f32(__p0_144, __p1_144, __p2_144) __extension__ ({ \
+  float32_t __s0_144 = __p0_144; \
+  float32x2_t __s1_144 = __p1_144; \
+  float32_t __ret_144; \
+  __ret_144 = __s0_144 * vget_lane_f32(__s1_144, __p2_144); \
+  __ret_144; \
 })
 #else
-#define vmuls_lane_f32(__p0_133, __p1_133, __p2_133) __extension__ ({ \
-  float32_t __s0_133 = __p0_133; \
-  float32x2_t __s1_133 = __p1_133; \
-  float32x2_t __rev1_133;  __rev1_133 = __builtin_shufflevector(__s1_133, __s1_133, 1, 0); \
-  float32_t __ret_133; \
-  __ret_133 = __s0_133 * __noswap_vget_lane_f32(__rev1_133, __p2_133); \
-  __ret_133; \
+#define vmuls_lane_f32(__p0_145, __p1_145, __p2_145) __extension__ ({ \
+  float32_t __s0_145 = __p0_145; \
+  float32x2_t __s1_145 = __p1_145; \
+  float32x2_t __rev1_145;  __rev1_145 = __builtin_shufflevector(__s1_145, __s1_145, 1, 0); \
+  float32_t __ret_145; \
+  __ret_145 = __s0_145 * __noswap_vget_lane_f32(__rev1_145, __p2_145); \
+  __ret_145; \
 })
 #endif
 
@@ -53873,40 +58823,40 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vmuld_laneq_f64(__p0_134, __p1_134, __p2_134) __extension__ ({ \
-  float64_t __s0_134 = __p0_134; \
-  float64x2_t __s1_134 = __p1_134; \
-  float64_t __ret_134; \
-  __ret_134 = __s0_134 * vgetq_lane_f64(__s1_134, __p2_134); \
-  __ret_134; \
+#define vmuld_laneq_f64(__p0_146, __p1_146, __p2_146) __extension__ ({ \
+  float64_t __s0_146 = __p0_146; \
+  float64x2_t __s1_146 = __p1_146; \
+  float64_t __ret_146; \
+  __ret_146 = __s0_146 * vgetq_lane_f64(__s1_146, __p2_146); \
+  __ret_146; \
 })
 #else
-#define vmuld_laneq_f64(__p0_135, __p1_135, __p2_135) __extension__ ({ \
-  float64_t __s0_135 = __p0_135; \
-  float64x2_t __s1_135 = __p1_135; \
-  float64x2_t __rev1_135;  __rev1_135 = __builtin_shufflevector(__s1_135, __s1_135, 1, 0); \
-  float64_t __ret_135; \
-  __ret_135 = __s0_135 * __noswap_vgetq_lane_f64(__rev1_135, __p2_135); \
-  __ret_135; \
+#define vmuld_laneq_f64(__p0_147, __p1_147, __p2_147) __extension__ ({ \
+  float64_t __s0_147 = __p0_147; \
+  float64x2_t __s1_147 = __p1_147; \
+  float64x2_t __rev1_147;  __rev1_147 = __builtin_shufflevector(__s1_147, __s1_147, 1, 0); \
+  float64_t __ret_147; \
+  __ret_147 = __s0_147 * __noswap_vgetq_lane_f64(__rev1_147, __p2_147); \
+  __ret_147; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vmuls_laneq_f32(__p0_136, __p1_136, __p2_136) __extension__ ({ \
-  float32_t __s0_136 = __p0_136; \
-  float32x4_t __s1_136 = __p1_136; \
-  float32_t __ret_136; \
-  __ret_136 = __s0_136 * vgetq_lane_f32(__s1_136, __p2_136); \
-  __ret_136; \
+#define vmuls_laneq_f32(__p0_148, __p1_148, __p2_148) __extension__ ({ \
+  float32_t __s0_148 = __p0_148; \
+  float32x4_t __s1_148 = __p1_148; \
+  float32_t __ret_148; \
+  __ret_148 = __s0_148 * vgetq_lane_f32(__s1_148, __p2_148); \
+  __ret_148; \
 })
 #else
-#define vmuls_laneq_f32(__p0_137, __p1_137, __p2_137) __extension__ ({ \
-  float32_t __s0_137 = __p0_137; \
-  float32x4_t __s1_137 = __p1_137; \
-  float32x4_t __rev1_137;  __rev1_137 = __builtin_shufflevector(__s1_137, __s1_137, 3, 2, 1, 0); \
-  float32_t __ret_137; \
-  __ret_137 = __s0_137 * __noswap_vgetq_lane_f32(__rev1_137, __p2_137); \
-  __ret_137; \
+#define vmuls_laneq_f32(__p0_149, __p1_149, __p2_149) __extension__ ({ \
+  float32_t __s0_149 = __p0_149; \
+  float32x4_t __s1_149 = __p1_149; \
+  float32x4_t __rev1_149;  __rev1_149 = __builtin_shufflevector(__s1_149, __s1_149, 3, 2, 1, 0); \
+  float32_t __ret_149; \
+  __ret_149 = __s0_149 * __noswap_vgetq_lane_f32(__rev1_149, __p2_149); \
+  __ret_149; \
 })
 #endif
 
@@ -54779,39 +59729,39 @@
 #endif
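// The vmulx*_lane forms below wrap FMULX, which multiplies like a
// normal fmul except that (±0) * (±Inf) yields ±2.0 instead of a NaN,
// a property used in reciprocal-step style algorithms; e.g.
// vmulxd_f64(0.0, INFINITY) == 2.0.  The lane plumbing is identical
// to the vmul*_lane forms above.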
 
 #ifdef __LITTLE_ENDIAN__
-#define vmulxd_lane_f64(__p0_138, __p1_138, __p2_138) __extension__ ({ \
-  float64_t __s0_138 = __p0_138; \
-  float64x1_t __s1_138 = __p1_138; \
-  float64_t __ret_138; \
-  __ret_138 = vmulxd_f64(__s0_138, vget_lane_f64(__s1_138, __p2_138)); \
-  __ret_138; \
+#define vmulxd_lane_f64(__p0_150, __p1_150, __p2_150) __extension__ ({ \
+  float64_t __s0_150 = __p0_150; \
+  float64x1_t __s1_150 = __p1_150; \
+  float64_t __ret_150; \
+  __ret_150 = vmulxd_f64(__s0_150, vget_lane_f64(__s1_150, __p2_150)); \
+  __ret_150; \
 })
 #else
-#define vmulxd_lane_f64(__p0_139, __p1_139, __p2_139) __extension__ ({ \
-  float64_t __s0_139 = __p0_139; \
-  float64x1_t __s1_139 = __p1_139; \
-  float64_t __ret_139; \
-  __ret_139 = __noswap_vmulxd_f64(__s0_139, __noswap_vget_lane_f64(__s1_139, __p2_139)); \
-  __ret_139; \
+#define vmulxd_lane_f64(__p0_151, __p1_151, __p2_151) __extension__ ({ \
+  float64_t __s0_151 = __p0_151; \
+  float64x1_t __s1_151 = __p1_151; \
+  float64_t __ret_151; \
+  __ret_151 = __noswap_vmulxd_f64(__s0_151, __noswap_vget_lane_f64(__s1_151, __p2_151)); \
+  __ret_151; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vmulxs_lane_f32(__p0_140, __p1_140, __p2_140) __extension__ ({ \
-  float32_t __s0_140 = __p0_140; \
-  float32x2_t __s1_140 = __p1_140; \
-  float32_t __ret_140; \
-  __ret_140 = vmulxs_f32(__s0_140, vget_lane_f32(__s1_140, __p2_140)); \
-  __ret_140; \
+#define vmulxs_lane_f32(__p0_152, __p1_152, __p2_152) __extension__ ({ \
+  float32_t __s0_152 = __p0_152; \
+  float32x2_t __s1_152 = __p1_152; \
+  float32_t __ret_152; \
+  __ret_152 = vmulxs_f32(__s0_152, vget_lane_f32(__s1_152, __p2_152)); \
+  __ret_152; \
 })
 #else
-#define vmulxs_lane_f32(__p0_141, __p1_141, __p2_141) __extension__ ({ \
-  float32_t __s0_141 = __p0_141; \
-  float32x2_t __s1_141 = __p1_141; \
-  float32x2_t __rev1_141;  __rev1_141 = __builtin_shufflevector(__s1_141, __s1_141, 1, 0); \
-  float32_t __ret_141; \
-  __ret_141 = __noswap_vmulxs_f32(__s0_141, __noswap_vget_lane_f32(__rev1_141, __p2_141)); \
-  __ret_141; \
+#define vmulxs_lane_f32(__p0_153, __p1_153, __p2_153) __extension__ ({ \
+  float32_t __s0_153 = __p0_153; \
+  float32x2_t __s1_153 = __p1_153; \
+  float32x2_t __rev1_153;  __rev1_153 = __builtin_shufflevector(__s1_153, __s1_153, 1, 0); \
+  float32_t __ret_153; \
+  __ret_153 = __noswap_vmulxs_f32(__s0_153, __noswap_vget_lane_f32(__rev1_153, __p2_153)); \
+  __ret_153; \
 })
 #endif
 
@@ -54878,40 +59828,40 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vmulxd_laneq_f64(__p0_142, __p1_142, __p2_142) __extension__ ({ \
-  float64_t __s0_142 = __p0_142; \
-  float64x2_t __s1_142 = __p1_142; \
-  float64_t __ret_142; \
-  __ret_142 = vmulxd_f64(__s0_142, vgetq_lane_f64(__s1_142, __p2_142)); \
-  __ret_142; \
+#define vmulxd_laneq_f64(__p0_154, __p1_154, __p2_154) __extension__ ({ \
+  float64_t __s0_154 = __p0_154; \
+  float64x2_t __s1_154 = __p1_154; \
+  float64_t __ret_154; \
+  __ret_154 = vmulxd_f64(__s0_154, vgetq_lane_f64(__s1_154, __p2_154)); \
+  __ret_154; \
 })
 #else
-#define vmulxd_laneq_f64(__p0_143, __p1_143, __p2_143) __extension__ ({ \
-  float64_t __s0_143 = __p0_143; \
-  float64x2_t __s1_143 = __p1_143; \
-  float64x2_t __rev1_143;  __rev1_143 = __builtin_shufflevector(__s1_143, __s1_143, 1, 0); \
-  float64_t __ret_143; \
-  __ret_143 = __noswap_vmulxd_f64(__s0_143, __noswap_vgetq_lane_f64(__rev1_143, __p2_143)); \
-  __ret_143; \
+#define vmulxd_laneq_f64(__p0_155, __p1_155, __p2_155) __extension__ ({ \
+  float64_t __s0_155 = __p0_155; \
+  float64x2_t __s1_155 = __p1_155; \
+  float64x2_t __rev1_155;  __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 1, 0); \
+  float64_t __ret_155; \
+  __ret_155 = __noswap_vmulxd_f64(__s0_155, __noswap_vgetq_lane_f64(__rev1_155, __p2_155)); \
+  __ret_155; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vmulxs_laneq_f32(__p0_144, __p1_144, __p2_144) __extension__ ({ \
-  float32_t __s0_144 = __p0_144; \
-  float32x4_t __s1_144 = __p1_144; \
-  float32_t __ret_144; \
-  __ret_144 = vmulxs_f32(__s0_144, vgetq_lane_f32(__s1_144, __p2_144)); \
-  __ret_144; \
+#define vmulxs_laneq_f32(__p0_156, __p1_156, __p2_156) __extension__ ({ \
+  float32_t __s0_156 = __p0_156; \
+  float32x4_t __s1_156 = __p1_156; \
+  float32_t __ret_156; \
+  __ret_156 = vmulxs_f32(__s0_156, vgetq_lane_f32(__s1_156, __p2_156)); \
+  __ret_156; \
 })
 #else
-#define vmulxs_laneq_f32(__p0_145, __p1_145, __p2_145) __extension__ ({ \
-  float32_t __s0_145 = __p0_145; \
-  float32x4_t __s1_145 = __p1_145; \
-  float32x4_t __rev1_145;  __rev1_145 = __builtin_shufflevector(__s1_145, __s1_145, 3, 2, 1, 0); \
-  float32_t __ret_145; \
-  __ret_145 = __noswap_vmulxs_f32(__s0_145, __noswap_vgetq_lane_f32(__rev1_145, __p2_145)); \
-  __ret_145; \
+#define vmulxs_laneq_f32(__p0_157, __p1_157, __p2_157) __extension__ ({ \
+  float32_t __s0_157 = __p0_157; \
+  float32x4_t __s1_157 = __p1_157; \
+  float32x4_t __rev1_157;  __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 3, 2, 1, 0); \
+  float32_t __ret_157; \
+  __ret_157 = __noswap_vmulxs_f32(__s0_157, __noswap_vgetq_lane_f32(__rev1_157, __p2_157)); \
+  __ret_157; \
 })
 #endif
 
@@ -56675,78 +61625,78 @@
 #endif
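// The vqdmulh*_lane forms below: vqdmulhs_s32 / vqdmulhh_s16 compute
// a saturating doubling multiply and return the high half, roughly
// (2*a*b) >> 32 (or >> 16) with saturation; the _lane/_laneq wrappers
// just fetch the second operand from a vector lane first.  Worked
// example with hypothetical values:
//
//   int16_t r = vqdmulhh_s16(16384, 16384);
//   // 2 * 16384 * 16384 == 2^29;  2^29 >> 16 == 8192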
 
 #ifdef __LITTLE_ENDIAN__
-#define vqdmulhs_lane_s32(__p0_146, __p1_146, __p2_146) __extension__ ({ \
-  int32_t __s0_146 = __p0_146; \
-  int32x2_t __s1_146 = __p1_146; \
-  int32_t __ret_146; \
-  __ret_146 = vqdmulhs_s32(__s0_146, vget_lane_s32(__s1_146, __p2_146)); \
-  __ret_146; \
+#define vqdmulhs_lane_s32(__p0_158, __p1_158, __p2_158) __extension__ ({ \
+  int32_t __s0_158 = __p0_158; \
+  int32x2_t __s1_158 = __p1_158; \
+  int32_t __ret_158; \
+  __ret_158 = vqdmulhs_s32(__s0_158, vget_lane_s32(__s1_158, __p2_158)); \
+  __ret_158; \
 })
 #else
-#define vqdmulhs_lane_s32(__p0_147, __p1_147, __p2_147) __extension__ ({ \
-  int32_t __s0_147 = __p0_147; \
-  int32x2_t __s1_147 = __p1_147; \
-  int32x2_t __rev1_147;  __rev1_147 = __builtin_shufflevector(__s1_147, __s1_147, 1, 0); \
-  int32_t __ret_147; \
-  __ret_147 = __noswap_vqdmulhs_s32(__s0_147, __noswap_vget_lane_s32(__rev1_147, __p2_147)); \
-  __ret_147; \
+#define vqdmulhs_lane_s32(__p0_159, __p1_159, __p2_159) __extension__ ({ \
+  int32_t __s0_159 = __p0_159; \
+  int32x2_t __s1_159 = __p1_159; \
+  int32x2_t __rev1_159;  __rev1_159 = __builtin_shufflevector(__s1_159, __s1_159, 1, 0); \
+  int32_t __ret_159; \
+  __ret_159 = __noswap_vqdmulhs_s32(__s0_159, __noswap_vget_lane_s32(__rev1_159, __p2_159)); \
+  __ret_159; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqdmulhh_lane_s16(__p0_148, __p1_148, __p2_148) __extension__ ({ \
-  int16_t __s0_148 = __p0_148; \
-  int16x4_t __s1_148 = __p1_148; \
-  int16_t __ret_148; \
-  __ret_148 = vqdmulhh_s16(__s0_148, vget_lane_s16(__s1_148, __p2_148)); \
-  __ret_148; \
+#define vqdmulhh_lane_s16(__p0_160, __p1_160, __p2_160) __extension__ ({ \
+  int16_t __s0_160 = __p0_160; \
+  int16x4_t __s1_160 = __p1_160; \
+  int16_t __ret_160; \
+  __ret_160 = vqdmulhh_s16(__s0_160, vget_lane_s16(__s1_160, __p2_160)); \
+  __ret_160; \
 })
 #else
-#define vqdmulhh_lane_s16(__p0_149, __p1_149, __p2_149) __extension__ ({ \
-  int16_t __s0_149 = __p0_149; \
-  int16x4_t __s1_149 = __p1_149; \
-  int16x4_t __rev1_149;  __rev1_149 = __builtin_shufflevector(__s1_149, __s1_149, 3, 2, 1, 0); \
-  int16_t __ret_149; \
-  __ret_149 = __noswap_vqdmulhh_s16(__s0_149, __noswap_vget_lane_s16(__rev1_149, __p2_149)); \
-  __ret_149; \
+#define vqdmulhh_lane_s16(__p0_161, __p1_161, __p2_161) __extension__ ({ \
+  int16_t __s0_161 = __p0_161; \
+  int16x4_t __s1_161 = __p1_161; \
+  int16x4_t __rev1_161;  __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 3, 2, 1, 0); \
+  int16_t __ret_161; \
+  __ret_161 = __noswap_vqdmulhh_s16(__s0_161, __noswap_vget_lane_s16(__rev1_161, __p2_161)); \
+  __ret_161; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqdmulhs_laneq_s32(__p0_150, __p1_150, __p2_150) __extension__ ({ \
-  int32_t __s0_150 = __p0_150; \
-  int32x4_t __s1_150 = __p1_150; \
-  int32_t __ret_150; \
-  __ret_150 = vqdmulhs_s32(__s0_150, vgetq_lane_s32(__s1_150, __p2_150)); \
-  __ret_150; \
+#define vqdmulhs_laneq_s32(__p0_162, __p1_162, __p2_162) __extension__ ({ \
+  int32_t __s0_162 = __p0_162; \
+  int32x4_t __s1_162 = __p1_162; \
+  int32_t __ret_162; \
+  __ret_162 = vqdmulhs_s32(__s0_162, vgetq_lane_s32(__s1_162, __p2_162)); \
+  __ret_162; \
 })
 #else
-#define vqdmulhs_laneq_s32(__p0_151, __p1_151, __p2_151) __extension__ ({ \
-  int32_t __s0_151 = __p0_151; \
-  int32x4_t __s1_151 = __p1_151; \
-  int32x4_t __rev1_151;  __rev1_151 = __builtin_shufflevector(__s1_151, __s1_151, 3, 2, 1, 0); \
-  int32_t __ret_151; \
-  __ret_151 = __noswap_vqdmulhs_s32(__s0_151, __noswap_vgetq_lane_s32(__rev1_151, __p2_151)); \
-  __ret_151; \
+#define vqdmulhs_laneq_s32(__p0_163, __p1_163, __p2_163) __extension__ ({ \
+  int32_t __s0_163 = __p0_163; \
+  int32x4_t __s1_163 = __p1_163; \
+  int32x4_t __rev1_163;  __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 3, 2, 1, 0); \
+  int32_t __ret_163; \
+  __ret_163 = __noswap_vqdmulhs_s32(__s0_163, __noswap_vgetq_lane_s32(__rev1_163, __p2_163)); \
+  __ret_163; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqdmulhh_laneq_s16(__p0_152, __p1_152, __p2_152) __extension__ ({ \
-  int16_t __s0_152 = __p0_152; \
-  int16x8_t __s1_152 = __p1_152; \
-  int16_t __ret_152; \
-  __ret_152 = vqdmulhh_s16(__s0_152, vgetq_lane_s16(__s1_152, __p2_152)); \
-  __ret_152; \
+#define vqdmulhh_laneq_s16(__p0_164, __p1_164, __p2_164) __extension__ ({ \
+  int16_t __s0_164 = __p0_164; \
+  int16x8_t __s1_164 = __p1_164; \
+  int16_t __ret_164; \
+  __ret_164 = vqdmulhh_s16(__s0_164, vgetq_lane_s16(__s1_164, __p2_164)); \
+  __ret_164; \
 })
 #else
-#define vqdmulhh_laneq_s16(__p0_153, __p1_153, __p2_153) __extension__ ({ \
-  int16_t __s0_153 = __p0_153; \
-  int16x8_t __s1_153 = __p1_153; \
-  int16x8_t __rev1_153;  __rev1_153 = __builtin_shufflevector(__s1_153, __s1_153, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16_t __ret_153; \
-  __ret_153 = __noswap_vqdmulhh_s16(__s0_153, __noswap_vgetq_lane_s16(__rev1_153, __p2_153)); \
-  __ret_153; \
+#define vqdmulhh_laneq_s16(__p0_165, __p1_165, __p2_165) __extension__ ({ \
+  int16_t __s0_165 = __p0_165; \
+  int16x8_t __s1_165 = __p1_165; \
+  int16x8_t __rev1_165;  __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_165; \
+  __ret_165 = __noswap_vqdmulhh_s16(__s0_165, __noswap_vgetq_lane_s16(__rev1_165, __p2_165)); \
+  __ret_165; \
 })
 #endif
 
@@ -57023,78 +61973,78 @@
 #endif
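// vqdmulls_lane / vqdmullh_lane below are the widening siblings: the
// saturating doubling multiply keeps the full product, so the s32
// forms return int64_t and the s16 forms return int32_t instead of
// truncating to the high half.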
 
 #ifdef __LITTLE_ENDIAN__
-#define vqdmulls_lane_s32(__p0_154, __p1_154, __p2_154) __extension__ ({ \
-  int32_t __s0_154 = __p0_154; \
-  int32x2_t __s1_154 = __p1_154; \
-  int64_t __ret_154; \
-  __ret_154 = vqdmulls_s32(__s0_154, vget_lane_s32(__s1_154, __p2_154)); \
-  __ret_154; \
+#define vqdmulls_lane_s32(__p0_166, __p1_166, __p2_166) __extension__ ({ \
+  int32_t __s0_166 = __p0_166; \
+  int32x2_t __s1_166 = __p1_166; \
+  int64_t __ret_166; \
+  __ret_166 = vqdmulls_s32(__s0_166, vget_lane_s32(__s1_166, __p2_166)); \
+  __ret_166; \
 })
 #else
-#define vqdmulls_lane_s32(__p0_155, __p1_155, __p2_155) __extension__ ({ \
-  int32_t __s0_155 = __p0_155; \
-  int32x2_t __s1_155 = __p1_155; \
-  int32x2_t __rev1_155;  __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 1, 0); \
-  int64_t __ret_155; \
-  __ret_155 = __noswap_vqdmulls_s32(__s0_155, __noswap_vget_lane_s32(__rev1_155, __p2_155)); \
-  __ret_155; \
+#define vqdmulls_lane_s32(__p0_167, __p1_167, __p2_167) __extension__ ({ \
+  int32_t __s0_167 = __p0_167; \
+  int32x2_t __s1_167 = __p1_167; \
+  int32x2_t __rev1_167;  __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 1, 0); \
+  int64_t __ret_167; \
+  __ret_167 = __noswap_vqdmulls_s32(__s0_167, __noswap_vget_lane_s32(__rev1_167, __p2_167)); \
+  __ret_167; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqdmullh_lane_s16(__p0_156, __p1_156, __p2_156) __extension__ ({ \
-  int16_t __s0_156 = __p0_156; \
-  int16x4_t __s1_156 = __p1_156; \
-  int32_t __ret_156; \
-  __ret_156 = vqdmullh_s16(__s0_156, vget_lane_s16(__s1_156, __p2_156)); \
-  __ret_156; \
+#define vqdmullh_lane_s16(__p0_168, __p1_168, __p2_168) __extension__ ({ \
+  int16_t __s0_168 = __p0_168; \
+  int16x4_t __s1_168 = __p1_168; \
+  int32_t __ret_168; \
+  __ret_168 = vqdmullh_s16(__s0_168, vget_lane_s16(__s1_168, __p2_168)); \
+  __ret_168; \
 })
 #else
-#define vqdmullh_lane_s16(__p0_157, __p1_157, __p2_157) __extension__ ({ \
-  int16_t __s0_157 = __p0_157; \
-  int16x4_t __s1_157 = __p1_157; \
-  int16x4_t __rev1_157;  __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 3, 2, 1, 0); \
-  int32_t __ret_157; \
-  __ret_157 = __noswap_vqdmullh_s16(__s0_157, __noswap_vget_lane_s16(__rev1_157, __p2_157)); \
-  __ret_157; \
+#define vqdmullh_lane_s16(__p0_169, __p1_169, __p2_169) __extension__ ({ \
+  int16_t __s0_169 = __p0_169; \
+  int16x4_t __s1_169 = __p1_169; \
+  int16x4_t __rev1_169;  __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 3, 2, 1, 0); \
+  int32_t __ret_169; \
+  __ret_169 = __noswap_vqdmullh_s16(__s0_169, __noswap_vget_lane_s16(__rev1_169, __p2_169)); \
+  __ret_169; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqdmulls_laneq_s32(__p0_158, __p1_158, __p2_158) __extension__ ({ \
-  int32_t __s0_158 = __p0_158; \
-  int32x4_t __s1_158 = __p1_158; \
-  int64_t __ret_158; \
-  __ret_158 = vqdmulls_s32(__s0_158, vgetq_lane_s32(__s1_158, __p2_158)); \
-  __ret_158; \
+#define vqdmulls_laneq_s32(__p0_170, __p1_170, __p2_170) __extension__ ({ \
+  int32_t __s0_170 = __p0_170; \
+  int32x4_t __s1_170 = __p1_170; \
+  int64_t __ret_170; \
+  __ret_170 = vqdmulls_s32(__s0_170, vgetq_lane_s32(__s1_170, __p2_170)); \
+  __ret_170; \
 })
 #else
-#define vqdmulls_laneq_s32(__p0_159, __p1_159, __p2_159) __extension__ ({ \
-  int32_t __s0_159 = __p0_159; \
-  int32x4_t __s1_159 = __p1_159; \
-  int32x4_t __rev1_159;  __rev1_159 = __builtin_shufflevector(__s1_159, __s1_159, 3, 2, 1, 0); \
-  int64_t __ret_159; \
-  __ret_159 = __noswap_vqdmulls_s32(__s0_159, __noswap_vgetq_lane_s32(__rev1_159, __p2_159)); \
-  __ret_159; \
+#define vqdmulls_laneq_s32(__p0_171, __p1_171, __p2_171) __extension__ ({ \
+  int32_t __s0_171 = __p0_171; \
+  int32x4_t __s1_171 = __p1_171; \
+  int32x4_t __rev1_171;  __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \
+  int64_t __ret_171; \
+  __ret_171 = __noswap_vqdmulls_s32(__s0_171, __noswap_vgetq_lane_s32(__rev1_171, __p2_171)); \
+  __ret_171; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqdmullh_laneq_s16(__p0_160, __p1_160, __p2_160) __extension__ ({ \
-  int16_t __s0_160 = __p0_160; \
-  int16x8_t __s1_160 = __p1_160; \
-  int32_t __ret_160; \
-  __ret_160 = vqdmullh_s16(__s0_160, vgetq_lane_s16(__s1_160, __p2_160)); \
-  __ret_160; \
+#define vqdmullh_laneq_s16(__p0_172, __p1_172, __p2_172) __extension__ ({ \
+  int16_t __s0_172 = __p0_172; \
+  int16x8_t __s1_172 = __p1_172; \
+  int32_t __ret_172; \
+  __ret_172 = vqdmullh_s16(__s0_172, vgetq_lane_s16(__s1_172, __p2_172)); \
+  __ret_172; \
 })
 #else
-#define vqdmullh_laneq_s16(__p0_161, __p1_161, __p2_161) __extension__ ({ \
-  int16_t __s0_161 = __p0_161; \
-  int16x8_t __s1_161 = __p1_161; \
-  int16x8_t __rev1_161;  __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int32_t __ret_161; \
-  __ret_161 = __noswap_vqdmullh_s16(__s0_161, __noswap_vgetq_lane_s16(__rev1_161, __p2_161)); \
-  __ret_161; \
+#define vqdmullh_laneq_s16(__p0_173, __p1_173, __p2_173) __extension__ ({ \
+  int16_t __s0_173 = __p0_173; \
+  int16x8_t __s1_173 = __p1_173; \
+  int16x8_t __rev1_173;  __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32_t __ret_173; \
+  __ret_173 = __noswap_vqdmullh_s16(__s0_173, __noswap_vgetq_lane_s16(__rev1_173, __p2_173)); \
+  __ret_173; \
 })
 #endif
 
@@ -57544,78 +62494,78 @@
 #endif
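// vqrdmulh* below is the rounding counterpart of vqdmulh*: a rounding
// constant of 1 << 15 (or 1 << 31) is added to the doubled product
// before the high half is taken, so vqrdmulhh_s16(a, b) behaves like
// (2*a*b + 0x8000) >> 16 with saturation.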
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmulhs_lane_s32(__p0_162, __p1_162, __p2_162) __extension__ ({ \
-  int32_t __s0_162 = __p0_162; \
-  int32x2_t __s1_162 = __p1_162; \
-  int32_t __ret_162; \
-  __ret_162 = vqrdmulhs_s32(__s0_162, vget_lane_s32(__s1_162, __p2_162)); \
-  __ret_162; \
+#define vqrdmulhs_lane_s32(__p0_174, __p1_174, __p2_174) __extension__ ({ \
+  int32_t __s0_174 = __p0_174; \
+  int32x2_t __s1_174 = __p1_174; \
+  int32_t __ret_174; \
+  __ret_174 = vqrdmulhs_s32(__s0_174, vget_lane_s32(__s1_174, __p2_174)); \
+  __ret_174; \
 })
 #else
-#define vqrdmulhs_lane_s32(__p0_163, __p1_163, __p2_163) __extension__ ({ \
-  int32_t __s0_163 = __p0_163; \
-  int32x2_t __s1_163 = __p1_163; \
-  int32x2_t __rev1_163;  __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 1, 0); \
-  int32_t __ret_163; \
-  __ret_163 = __noswap_vqrdmulhs_s32(__s0_163, __noswap_vget_lane_s32(__rev1_163, __p2_163)); \
-  __ret_163; \
+#define vqrdmulhs_lane_s32(__p0_175, __p1_175, __p2_175) __extension__ ({ \
+  int32_t __s0_175 = __p0_175; \
+  int32x2_t __s1_175 = __p1_175; \
+  int32x2_t __rev1_175;  __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 1, 0); \
+  int32_t __ret_175; \
+  __ret_175 = __noswap_vqrdmulhs_s32(__s0_175, __noswap_vget_lane_s32(__rev1_175, __p2_175)); \
+  __ret_175; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmulhh_lane_s16(__p0_164, __p1_164, __p2_164) __extension__ ({ \
-  int16_t __s0_164 = __p0_164; \
-  int16x4_t __s1_164 = __p1_164; \
-  int16_t __ret_164; \
-  __ret_164 = vqrdmulhh_s16(__s0_164, vget_lane_s16(__s1_164, __p2_164)); \
-  __ret_164; \
+#define vqrdmulhh_lane_s16(__p0_176, __p1_176, __p2_176) __extension__ ({ \
+  int16_t __s0_176 = __p0_176; \
+  int16x4_t __s1_176 = __p1_176; \
+  int16_t __ret_176; \
+  __ret_176 = vqrdmulhh_s16(__s0_176, vget_lane_s16(__s1_176, __p2_176)); \
+  __ret_176; \
 })
 #else
-#define vqrdmulhh_lane_s16(__p0_165, __p1_165, __p2_165) __extension__ ({ \
-  int16_t __s0_165 = __p0_165; \
-  int16x4_t __s1_165 = __p1_165; \
-  int16x4_t __rev1_165;  __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 3, 2, 1, 0); \
-  int16_t __ret_165; \
-  __ret_165 = __noswap_vqrdmulhh_s16(__s0_165, __noswap_vget_lane_s16(__rev1_165, __p2_165)); \
-  __ret_165; \
+#define vqrdmulhh_lane_s16(__p0_177, __p1_177, __p2_177) __extension__ ({ \
+  int16_t __s0_177 = __p0_177; \
+  int16x4_t __s1_177 = __p1_177; \
+  int16x4_t __rev1_177;  __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 3, 2, 1, 0); \
+  int16_t __ret_177; \
+  __ret_177 = __noswap_vqrdmulhh_s16(__s0_177, __noswap_vget_lane_s16(__rev1_177, __p2_177)); \
+  __ret_177; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmulhs_laneq_s32(__p0_166, __p1_166, __p2_166) __extension__ ({ \
-  int32_t __s0_166 = __p0_166; \
-  int32x4_t __s1_166 = __p1_166; \
-  int32_t __ret_166; \
-  __ret_166 = vqrdmulhs_s32(__s0_166, vgetq_lane_s32(__s1_166, __p2_166)); \
-  __ret_166; \
+#define vqrdmulhs_laneq_s32(__p0_178, __p1_178, __p2_178) __extension__ ({ \
+  int32_t __s0_178 = __p0_178; \
+  int32x4_t __s1_178 = __p1_178; \
+  int32_t __ret_178; \
+  __ret_178 = vqrdmulhs_s32(__s0_178, vgetq_lane_s32(__s1_178, __p2_178)); \
+  __ret_178; \
 })
 #else
-#define vqrdmulhs_laneq_s32(__p0_167, __p1_167, __p2_167) __extension__ ({ \
-  int32_t __s0_167 = __p0_167; \
-  int32x4_t __s1_167 = __p1_167; \
-  int32x4_t __rev1_167;  __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 3, 2, 1, 0); \
-  int32_t __ret_167; \
-  __ret_167 = __noswap_vqrdmulhs_s32(__s0_167, __noswap_vgetq_lane_s32(__rev1_167, __p2_167)); \
-  __ret_167; \
+#define vqrdmulhs_laneq_s32(__p0_179, __p1_179, __p2_179) __extension__ ({ \
+  int32_t __s0_179 = __p0_179; \
+  int32x4_t __s1_179 = __p1_179; \
+  int32x4_t __rev1_179;  __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 3, 2, 1, 0); \
+  int32_t __ret_179; \
+  __ret_179 = __noswap_vqrdmulhs_s32(__s0_179, __noswap_vgetq_lane_s32(__rev1_179, __p2_179)); \
+  __ret_179; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmulhh_laneq_s16(__p0_168, __p1_168, __p2_168) __extension__ ({ \
-  int16_t __s0_168 = __p0_168; \
-  int16x8_t __s1_168 = __p1_168; \
-  int16_t __ret_168; \
-  __ret_168 = vqrdmulhh_s16(__s0_168, vgetq_lane_s16(__s1_168, __p2_168)); \
-  __ret_168; \
+#define vqrdmulhh_laneq_s16(__p0_180, __p1_180, __p2_180) __extension__ ({ \
+  int16_t __s0_180 = __p0_180; \
+  int16x8_t __s1_180 = __p1_180; \
+  int16_t __ret_180; \
+  __ret_180 = vqrdmulhh_s16(__s0_180, vgetq_lane_s16(__s1_180, __p2_180)); \
+  __ret_180; \
 })
 #else
-#define vqrdmulhh_laneq_s16(__p0_169, __p1_169, __p2_169) __extension__ ({ \
-  int16_t __s0_169 = __p0_169; \
-  int16x8_t __s1_169 = __p1_169; \
-  int16x8_t __rev1_169;  __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16_t __ret_169; \
-  __ret_169 = __noswap_vqrdmulhh_s16(__s0_169, __noswap_vgetq_lane_s16(__rev1_169, __p2_169)); \
-  __ret_169; \
+#define vqrdmulhh_laneq_s16(__p0_181, __p1_181, __p2_181) __extension__ ({ \
+  int16_t __s0_181 = __p0_181; \
+  int16x8_t __s1_181 = __p1_181; \
+  int16x8_t __rev1_181;  __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_181; \
+  __ret_181 = __noswap_vqrdmulhh_s16(__s0_181, __noswap_vgetq_lane_s16(__rev1_181, __p2_181)); \
+  __ret_181; \
 })
 #endif
 
@@ -57816,928 +62766,928 @@
 #endif
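// The vqrshrn_high_n_<type> block below narrows with a saturating
// rounding right shift and packs the result into the upper half of a
// wider vector: lanes of __p0 stay in the low half and the narrowed
// __p1 fills the high half via vcombine.  Hypothetical sketch:
//
//   uint16x4_t low  = vdup_n_u16(7);
//   uint32x4_t wide = vdupq_n_u32(1024);
//   uint16x8_t r = vqrshrn_high_n_u32(low, wide, 4);
//   // lanes 0-3 == 7;  lanes 4-7 == (1024 + 8) >> 4 == 64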
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_u32(__p0_170, __p1_170, __p2_170) __extension__ ({ \
-  uint16x4_t __s0_170 = __p0_170; \
-  uint32x4_t __s1_170 = __p1_170; \
-  uint16x8_t __ret_170; \
-  __ret_170 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_170), (uint16x4_t)(vqrshrn_n_u32(__s1_170, __p2_170)))); \
-  __ret_170; \
-})
-#else
-#define vqrshrn_high_n_u32(__p0_171, __p1_171, __p2_171) __extension__ ({ \
-  uint16x4_t __s0_171 = __p0_171; \
-  uint32x4_t __s1_171 = __p1_171; \
-  uint16x4_t __rev0_171;  __rev0_171 = __builtin_shufflevector(__s0_171, __s0_171, 3, 2, 1, 0); \
-  uint32x4_t __rev1_171;  __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \
-  uint16x8_t __ret_171; \
-  __ret_171 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_171), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_171, __p2_171)))); \
-  __ret_171 = __builtin_shufflevector(__ret_171, __ret_171, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_171; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_u64(__p0_172, __p1_172, __p2_172) __extension__ ({ \
-  uint32x2_t __s0_172 = __p0_172; \
-  uint64x2_t __s1_172 = __p1_172; \
-  uint32x4_t __ret_172; \
-  __ret_172 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_172), (uint32x2_t)(vqrshrn_n_u64(__s1_172, __p2_172)))); \
-  __ret_172; \
-})
-#else
-#define vqrshrn_high_n_u64(__p0_173, __p1_173, __p2_173) __extension__ ({ \
-  uint32x2_t __s0_173 = __p0_173; \
-  uint64x2_t __s1_173 = __p1_173; \
-  uint32x2_t __rev0_173;  __rev0_173 = __builtin_shufflevector(__s0_173, __s0_173, 1, 0); \
-  uint64x2_t __rev1_173;  __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 1, 0); \
-  uint32x4_t __ret_173; \
-  __ret_173 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_173), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_173, __p2_173)))); \
-  __ret_173 = __builtin_shufflevector(__ret_173, __ret_173, 3, 2, 1, 0); \
-  __ret_173; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_u16(__p0_174, __p1_174, __p2_174) __extension__ ({ \
-  uint8x8_t __s0_174 = __p0_174; \
-  uint16x8_t __s1_174 = __p1_174; \
-  uint8x16_t __ret_174; \
-  __ret_174 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_174), (uint8x8_t)(vqrshrn_n_u16(__s1_174, __p2_174)))); \
-  __ret_174; \
-})
-#else
-#define vqrshrn_high_n_u16(__p0_175, __p1_175, __p2_175) __extension__ ({ \
-  uint8x8_t __s0_175 = __p0_175; \
-  uint16x8_t __s1_175 = __p1_175; \
-  uint8x8_t __rev0_175;  __rev0_175 = __builtin_shufflevector(__s0_175, __s0_175, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_175;  __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __ret_175; \
-  __ret_175 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_175), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_175, __p2_175)))); \
-  __ret_175 = __builtin_shufflevector(__ret_175, __ret_175, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_175; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_s32(__p0_176, __p1_176, __p2_176) __extension__ ({ \
-  int16x4_t __s0_176 = __p0_176; \
-  int32x4_t __s1_176 = __p1_176; \
-  int16x8_t __ret_176; \
-  __ret_176 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_176), (int16x4_t)(vqrshrn_n_s32(__s1_176, __p2_176)))); \
-  __ret_176; \
-})
-#else
-#define vqrshrn_high_n_s32(__p0_177, __p1_177, __p2_177) __extension__ ({ \
-  int16x4_t __s0_177 = __p0_177; \
-  int32x4_t __s1_177 = __p1_177; \
-  int16x4_t __rev0_177;  __rev0_177 = __builtin_shufflevector(__s0_177, __s0_177, 3, 2, 1, 0); \
-  int32x4_t __rev1_177;  __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 3, 2, 1, 0); \
-  int16x8_t __ret_177; \
-  __ret_177 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_177), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_177, __p2_177)))); \
-  __ret_177 = __builtin_shufflevector(__ret_177, __ret_177, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_177; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_s64(__p0_178, __p1_178, __p2_178) __extension__ ({ \
-  int32x2_t __s0_178 = __p0_178; \
-  int64x2_t __s1_178 = __p1_178; \
-  int32x4_t __ret_178; \
-  __ret_178 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_178), (int32x2_t)(vqrshrn_n_s64(__s1_178, __p2_178)))); \
-  __ret_178; \
-})
-#else
-#define vqrshrn_high_n_s64(__p0_179, __p1_179, __p2_179) __extension__ ({ \
-  int32x2_t __s0_179 = __p0_179; \
-  int64x2_t __s1_179 = __p1_179; \
-  int32x2_t __rev0_179;  __rev0_179 = __builtin_shufflevector(__s0_179, __s0_179, 1, 0); \
-  int64x2_t __rev1_179;  __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 1, 0); \
-  int32x4_t __ret_179; \
-  __ret_179 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_179), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_179, __p2_179)))); \
-  __ret_179 = __builtin_shufflevector(__ret_179, __ret_179, 3, 2, 1, 0); \
-  __ret_179; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_s16(__p0_180, __p1_180, __p2_180) __extension__ ({ \
-  int8x8_t __s0_180 = __p0_180; \
-  int16x8_t __s1_180 = __p1_180; \
-  int8x16_t __ret_180; \
-  __ret_180 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_180), (int8x8_t)(vqrshrn_n_s16(__s1_180, __p2_180)))); \
-  __ret_180; \
-})
-#else
-#define vqrshrn_high_n_s16(__p0_181, __p1_181, __p2_181) __extension__ ({ \
-  int8x8_t __s0_181 = __p0_181; \
-  int16x8_t __s1_181 = __p1_181; \
-  int8x8_t __rev0_181;  __rev0_181 = __builtin_shufflevector(__s0_181, __s0_181, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_181;  __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __ret_181; \
-  __ret_181 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_181), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_181, __p2_181)))); \
-  __ret_181 = __builtin_shufflevector(__ret_181, __ret_181, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_181; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __s0 = __p0; \
-  uint16_t __ret; \
-  __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __s0 = __p0; \
-  uint16_t __ret; \
-  __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __s0 = __p0; \
-  uint32_t __ret; \
-  __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __s0 = __p0; \
-  uint32_t __ret; \
-  __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __s0 = __p0; \
-  uint8_t __ret; \
-  __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __s0 = __p0; \
-  uint8_t __ret; \
-  __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __s0 = __p0; \
-  int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __s0 = __p0; \
-  int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __s0 = __p0; \
-  int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __s0 = __p0; \
-  int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrun_high_n_s32(__p0_182, __p1_182, __p2_182) __extension__ ({ \
-  int16x4_t __s0_182 = __p0_182; \
-  int32x4_t __s1_182 = __p1_182; \
-  int16x8_t __ret_182; \
-  __ret_182 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_182), (int16x4_t)(vqrshrun_n_s32(__s1_182, __p2_182)))); \
+#define vqrshrn_high_n_u32(__p0_182, __p1_182, __p2_182) __extension__ ({ \
+  uint16x4_t __s0_182 = __p0_182; \
+  uint32x4_t __s1_182 = __p1_182; \
+  uint16x8_t __ret_182; \
+  __ret_182 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_182), (uint16x4_t)(vqrshrn_n_u32(__s1_182, __p2_182)))); \
   __ret_182; \
 })
 #else
-#define vqrshrun_high_n_s32(__p0_183, __p1_183, __p2_183) __extension__ ({ \
-  int16x4_t __s0_183 = __p0_183; \
-  int32x4_t __s1_183 = __p1_183; \
-  int16x4_t __rev0_183;  __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 3, 2, 1, 0); \
-  int32x4_t __rev1_183;  __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 3, 2, 1, 0); \
-  int16x8_t __ret_183; \
-  __ret_183 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_183), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_183, __p2_183)))); \
+#define vqrshrn_high_n_u32(__p0_183, __p1_183, __p2_183) __extension__ ({ \
+  uint16x4_t __s0_183 = __p0_183; \
+  uint32x4_t __s1_183 = __p1_183; \
+  uint16x4_t __rev0_183;  __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 3, 2, 1, 0); \
+  uint32x4_t __rev1_183;  __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 3, 2, 1, 0); \
+  uint16x8_t __ret_183; \
+  __ret_183 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_183), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_183, __p2_183)))); \
   __ret_183 = __builtin_shufflevector(__ret_183, __ret_183, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_183; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrshrun_high_n_s64(__p0_184, __p1_184, __p2_184) __extension__ ({ \
-  int32x2_t __s0_184 = __p0_184; \
-  int64x2_t __s1_184 = __p1_184; \
-  int32x4_t __ret_184; \
-  __ret_184 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_184), (int32x2_t)(vqrshrun_n_s64(__s1_184, __p2_184)))); \
+#define vqrshrn_high_n_u64(__p0_184, __p1_184, __p2_184) __extension__ ({ \
+  uint32x2_t __s0_184 = __p0_184; \
+  uint64x2_t __s1_184 = __p1_184; \
+  uint32x4_t __ret_184; \
+  __ret_184 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_184), (uint32x2_t)(vqrshrn_n_u64(__s1_184, __p2_184)))); \
   __ret_184; \
 })
 #else
-#define vqrshrun_high_n_s64(__p0_185, __p1_185, __p2_185) __extension__ ({ \
-  int32x2_t __s0_185 = __p0_185; \
-  int64x2_t __s1_185 = __p1_185; \
-  int32x2_t __rev0_185;  __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 1, 0); \
-  int64x2_t __rev1_185;  __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 1, 0); \
-  int32x4_t __ret_185; \
-  __ret_185 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_185), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_185, __p2_185)))); \
+#define vqrshrn_high_n_u64(__p0_185, __p1_185, __p2_185) __extension__ ({ \
+  uint32x2_t __s0_185 = __p0_185; \
+  uint64x2_t __s1_185 = __p1_185; \
+  uint32x2_t __rev0_185;  __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 1, 0); \
+  uint64x2_t __rev1_185;  __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 1, 0); \
+  uint32x4_t __ret_185; \
+  __ret_185 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_185), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_185, __p2_185)))); \
   __ret_185 = __builtin_shufflevector(__ret_185, __ret_185, 3, 2, 1, 0); \
   __ret_185; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrshrun_high_n_s16(__p0_186, __p1_186, __p2_186) __extension__ ({ \
-  int8x8_t __s0_186 = __p0_186; \
-  int16x8_t __s1_186 = __p1_186; \
-  int8x16_t __ret_186; \
-  __ret_186 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_186), (int8x8_t)(vqrshrun_n_s16(__s1_186, __p2_186)))); \
+#define vqrshrn_high_n_u16(__p0_186, __p1_186, __p2_186) __extension__ ({ \
+  uint8x8_t __s0_186 = __p0_186; \
+  uint16x8_t __s1_186 = __p1_186; \
+  uint8x16_t __ret_186; \
+  __ret_186 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_186), (uint8x8_t)(vqrshrn_n_u16(__s1_186, __p2_186)))); \
   __ret_186; \
 })
 #else
-#define vqrshrun_high_n_s16(__p0_187, __p1_187, __p2_187) __extension__ ({ \
-  int8x8_t __s0_187 = __p0_187; \
-  int16x8_t __s1_187 = __p1_187; \
-  int8x8_t __rev0_187;  __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_187;  __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __ret_187; \
-  __ret_187 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_187), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_187, __p2_187)))); \
+#define vqrshrn_high_n_u16(__p0_187, __p1_187, __p2_187) __extension__ ({ \
+  uint8x8_t __s0_187 = __p0_187; \
+  uint16x8_t __s1_187 = __p1_187; \
+  uint8x8_t __rev0_187;  __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_187;  __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_187; \
+  __ret_187 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_187), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_187, __p2_187)))); \
   __ret_187 = __builtin_shufflevector(__ret_187, __ret_187, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_187; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __s0 = __p0; \
-  int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __s0 = __p0; \
-  int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __s0 = __p0; \
-  int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __s0 = __p0; \
-  int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
-  return __ret;
-}
-#else
-__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
-  return __ret;
-}
-#else
-__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
-  return __ret;
-}
-#else
-__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
-  return __ret;
-}
-#else
-__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1);
-  return __ret;
-}
-#else
-__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1);
-  return __ret;
-}
-#else
-__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1);
-  return __ret;
-}
-#else
-__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1);
-  return __ret;
-}
-#else
-__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __s0 = __p0; \
-  uint8_t __ret; \
-  __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __s0 = __p0; \
-  uint8_t __ret; \
-  __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshls_n_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __s0 = __p0; \
-  uint32_t __ret; \
-  __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshls_n_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __s0 = __p0; \
-  uint32_t __ret; \
-  __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshld_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __s0 = __p0; \
-  uint64_t __ret; \
-  __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshld_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __s0 = __p0; \
-  uint64_t __ret; \
-  __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __s0 = __p0; \
-  uint16_t __ret; \
-  __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __s0 = __p0; \
-  uint16_t __ret; \
-  __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \
-  int8_t __s0 = __p0; \
-  int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \
-  int8_t __s0 = __p0; \
-  int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshls_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __s0 = __p0; \
-  int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshls_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __s0 = __p0; \
-  int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshld_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int64_t __ret; \
-  __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshld_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int64_t __ret; \
-  __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __s0 = __p0; \
-  int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __s0 = __p0; \
-  int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \
-  int8_t __s0 = __p0; \
-  int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \
-  int8_t __s0 = __p0; \
-  int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __s0 = __p0; \
-  int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __s0 = __p0; \
-  int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int64_t __ret; \
-  __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int64_t __ret; \
-  __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __s0 = __p0; \
-  int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __s0 = __p0; \
-  int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_u32(__p0_188, __p1_188, __p2_188) __extension__ ({ \
-  uint16x4_t __s0_188 = __p0_188; \
-  uint32x4_t __s1_188 = __p1_188; \
-  uint16x8_t __ret_188; \
-  __ret_188 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_188), (uint16x4_t)(vqshrn_n_u32(__s1_188, __p2_188)))); \
+#define vqrshrn_high_n_s32(__p0_188, __p1_188, __p2_188) __extension__ ({ \
+  int16x4_t __s0_188 = __p0_188; \
+  int32x4_t __s1_188 = __p1_188; \
+  int16x8_t __ret_188; \
+  __ret_188 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_188), (int16x4_t)(vqrshrn_n_s32(__s1_188, __p2_188)))); \
   __ret_188; \
 })
 #else
-#define vqshrn_high_n_u32(__p0_189, __p1_189, __p2_189) __extension__ ({ \
-  uint16x4_t __s0_189 = __p0_189; \
-  uint32x4_t __s1_189 = __p1_189; \
-  uint16x4_t __rev0_189;  __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \
-  uint32x4_t __rev1_189;  __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \
-  uint16x8_t __ret_189; \
-  __ret_189 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_189), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_189, __p2_189)))); \
+#define vqrshrn_high_n_s32(__p0_189, __p1_189, __p2_189) __extension__ ({ \
+  int16x4_t __s0_189 = __p0_189; \
+  int32x4_t __s1_189 = __p1_189; \
+  int16x4_t __rev0_189;  __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \
+  int32x4_t __rev1_189;  __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \
+  int16x8_t __ret_189; \
+  __ret_189 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_189), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_189, __p2_189)))); \
   __ret_189 = __builtin_shufflevector(__ret_189, __ret_189, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_189; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_u64(__p0_190, __p1_190, __p2_190) __extension__ ({ \
-  uint32x2_t __s0_190 = __p0_190; \
-  uint64x2_t __s1_190 = __p1_190; \
-  uint32x4_t __ret_190; \
-  __ret_190 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_190), (uint32x2_t)(vqshrn_n_u64(__s1_190, __p2_190)))); \
+#define vqrshrn_high_n_s64(__p0_190, __p1_190, __p2_190) __extension__ ({ \
+  int32x2_t __s0_190 = __p0_190; \
+  int64x2_t __s1_190 = __p1_190; \
+  int32x4_t __ret_190; \
+  __ret_190 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_190), (int32x2_t)(vqrshrn_n_s64(__s1_190, __p2_190)))); \
   __ret_190; \
 })
 #else
-#define vqshrn_high_n_u64(__p0_191, __p1_191, __p2_191) __extension__ ({ \
-  uint32x2_t __s0_191 = __p0_191; \
-  uint64x2_t __s1_191 = __p1_191; \
-  uint32x2_t __rev0_191;  __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \
-  uint64x2_t __rev1_191;  __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \
-  uint32x4_t __ret_191; \
-  __ret_191 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_191), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_191, __p2_191)))); \
+#define vqrshrn_high_n_s64(__p0_191, __p1_191, __p2_191) __extension__ ({ \
+  int32x2_t __s0_191 = __p0_191; \
+  int64x2_t __s1_191 = __p1_191; \
+  int32x2_t __rev0_191;  __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \
+  int64x2_t __rev1_191;  __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \
+  int32x4_t __ret_191; \
+  __ret_191 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_191), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_191, __p2_191)))); \
   __ret_191 = __builtin_shufflevector(__ret_191, __ret_191, 3, 2, 1, 0); \
   __ret_191; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_u16(__p0_192, __p1_192, __p2_192) __extension__ ({ \
-  uint8x8_t __s0_192 = __p0_192; \
-  uint16x8_t __s1_192 = __p1_192; \
-  uint8x16_t __ret_192; \
-  __ret_192 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_192), (uint8x8_t)(vqshrn_n_u16(__s1_192, __p2_192)))); \
+#define vqrshrn_high_n_s16(__p0_192, __p1_192, __p2_192) __extension__ ({ \
+  int8x8_t __s0_192 = __p0_192; \
+  int16x8_t __s1_192 = __p1_192; \
+  int8x16_t __ret_192; \
+  __ret_192 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_192), (int8x8_t)(vqrshrn_n_s16(__s1_192, __p2_192)))); \
   __ret_192; \
 })
 #else
-#define vqshrn_high_n_u16(__p0_193, __p1_193, __p2_193) __extension__ ({ \
-  uint8x8_t __s0_193 = __p0_193; \
-  uint16x8_t __s1_193 = __p1_193; \
-  uint8x8_t __rev0_193;  __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_193;  __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __ret_193; \
-  __ret_193 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_193), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_193, __p2_193)))); \
+#define vqrshrn_high_n_s16(__p0_193, __p1_193, __p2_193) __extension__ ({ \
+  int8x8_t __s0_193 = __p0_193; \
+  int16x8_t __s1_193 = __p1_193; \
+  int8x8_t __rev0_193;  __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_193;  __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_193; \
+  __ret_193 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_193), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_193, __p2_193)))); \
   __ret_193 = __builtin_shufflevector(__ret_193, __ret_193, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_193; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_s32(__p0_194, __p1_194, __p2_194) __extension__ ({ \
+#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_high_n_s32(__p0_194, __p1_194, __p2_194) __extension__ ({ \
   int16x4_t __s0_194 = __p0_194; \
   int32x4_t __s1_194 = __p1_194; \
   int16x8_t __ret_194; \
-  __ret_194 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_194), (int16x4_t)(vqshrn_n_s32(__s1_194, __p2_194)))); \
+  __ret_194 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_194), (int16x4_t)(vqrshrun_n_s32(__s1_194, __p2_194)))); \
   __ret_194; \
 })
 #else
-#define vqshrn_high_n_s32(__p0_195, __p1_195, __p2_195) __extension__ ({ \
+#define vqrshrun_high_n_s32(__p0_195, __p1_195, __p2_195) __extension__ ({ \
   int16x4_t __s0_195 = __p0_195; \
   int32x4_t __s1_195 = __p1_195; \
   int16x4_t __rev0_195;  __rev0_195 = __builtin_shufflevector(__s0_195, __s0_195, 3, 2, 1, 0); \
   int32x4_t __rev1_195;  __rev1_195 = __builtin_shufflevector(__s1_195, __s1_195, 3, 2, 1, 0); \
   int16x8_t __ret_195; \
-  __ret_195 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_195), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_195, __p2_195)))); \
+  __ret_195 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_195), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_195, __p2_195)))); \
   __ret_195 = __builtin_shufflevector(__ret_195, __ret_195, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_195; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_s64(__p0_196, __p1_196, __p2_196) __extension__ ({ \
+#define vqrshrun_high_n_s64(__p0_196, __p1_196, __p2_196) __extension__ ({ \
   int32x2_t __s0_196 = __p0_196; \
   int64x2_t __s1_196 = __p1_196; \
   int32x4_t __ret_196; \
-  __ret_196 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_196), (int32x2_t)(vqshrn_n_s64(__s1_196, __p2_196)))); \
+  __ret_196 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_196), (int32x2_t)(vqrshrun_n_s64(__s1_196, __p2_196)))); \
   __ret_196; \
 })
 #else
-#define vqshrn_high_n_s64(__p0_197, __p1_197, __p2_197) __extension__ ({ \
+#define vqrshrun_high_n_s64(__p0_197, __p1_197, __p2_197) __extension__ ({ \
   int32x2_t __s0_197 = __p0_197; \
   int64x2_t __s1_197 = __p1_197; \
   int32x2_t __rev0_197;  __rev0_197 = __builtin_shufflevector(__s0_197, __s0_197, 1, 0); \
   int64x2_t __rev1_197;  __rev1_197 = __builtin_shufflevector(__s1_197, __s1_197, 1, 0); \
   int32x4_t __ret_197; \
-  __ret_197 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_197), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_197, __p2_197)))); \
+  __ret_197 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_197), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_197, __p2_197)))); \
   __ret_197 = __builtin_shufflevector(__ret_197, __ret_197, 3, 2, 1, 0); \
   __ret_197; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_s16(__p0_198, __p1_198, __p2_198) __extension__ ({ \
+#define vqrshrun_high_n_s16(__p0_198, __p1_198, __p2_198) __extension__ ({ \
   int8x8_t __s0_198 = __p0_198; \
   int16x8_t __s1_198 = __p1_198; \
   int8x16_t __ret_198; \
-  __ret_198 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_198), (int8x8_t)(vqshrn_n_s16(__s1_198, __p2_198)))); \
+  __ret_198 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_198), (int8x8_t)(vqrshrun_n_s16(__s1_198, __p2_198)))); \
   __ret_198; \
 })
 #else
-#define vqshrn_high_n_s16(__p0_199, __p1_199, __p2_199) __extension__ ({ \
+#define vqrshrun_high_n_s16(__p0_199, __p1_199, __p2_199) __extension__ ({ \
   int8x8_t __s0_199 = __p0_199; \
   int16x8_t __s1_199 = __p1_199; \
   int8x8_t __rev0_199;  __rev0_199 = __builtin_shufflevector(__s0_199, __s0_199, 7, 6, 5, 4, 3, 2, 1, 0); \
   int16x8_t __rev1_199;  __rev1_199 = __builtin_shufflevector(__s1_199, __s1_199, 7, 6, 5, 4, 3, 2, 1, 0); \
   int8x16_t __ret_199; \
-  __ret_199 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_199), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_199, __p2_199)))); \
+  __ret_199 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_199), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_199, __p2_199)))); \
   __ret_199 = __builtin_shufflevector(__ret_199, __ret_199, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_199; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __s0 = __p0; \
-  uint16_t __ret; \
-  __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __s0 = __p0; \
-  uint16_t __ret; \
-  __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __s0 = __p0; \
-  uint32_t __ret; \
-  __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __s0 = __p0; \
-  uint32_t __ret; \
-  __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __s0 = __p0; \
-  uint8_t __ret; \
-  __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __s0 = __p0; \
-  uint8_t __ret; \
-  __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \
+#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \
   int32_t __s0 = __p0; \
   int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \
+  __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \
   __ret; \
 })
 #else
-#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \
+#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \
   int32_t __s0 = __p0; \
   int16_t __ret; \
-  __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \
+  __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \
   __ret; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \
+#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \
   int64_t __s0 = __p0; \
   int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \
+  __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \
   __ret; \
 })
 #else
-#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \
+#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \
   int64_t __s0 = __p0; \
   int32_t __ret; \
-  __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \
+  __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \
   __ret; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \
+#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \
   int16_t __s0 = __p0; \
   int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \
+  __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
   __ret; \
 })
 #else
-#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \
+#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \
   int16_t __s0 = __p0; \
   int8_t __ret; \
-  __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \
+  __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
   __ret; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrun_high_n_s32(__p0_200, __p1_200, __p2_200) __extension__ ({ \
-  int16x4_t __s0_200 = __p0_200; \
-  int32x4_t __s1_200 = __p1_200; \
-  int16x8_t __ret_200; \
-  __ret_200 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_200), (int16x4_t)(vqshrun_n_s32(__s1_200, __p2_200)))); \
+__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshls_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshls_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshls_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshls_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_u32(__p0_200, __p1_200, __p2_200) __extension__ ({ \
+  uint16x4_t __s0_200 = __p0_200; \
+  uint32x4_t __s1_200 = __p1_200; \
+  uint16x8_t __ret_200; \
+  __ret_200 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_200), (uint16x4_t)(vqshrn_n_u32(__s1_200, __p2_200)))); \
   __ret_200; \
 })
 #else
-#define vqshrun_high_n_s32(__p0_201, __p1_201, __p2_201) __extension__ ({ \
-  int16x4_t __s0_201 = __p0_201; \
-  int32x4_t __s1_201 = __p1_201; \
-  int16x4_t __rev0_201;  __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \
-  int32x4_t __rev1_201;  __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \
-  int16x8_t __ret_201; \
-  __ret_201 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_201), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_201, __p2_201)))); \
+#define vqshrn_high_n_u32(__p0_201, __p1_201, __p2_201) __extension__ ({ \
+  uint16x4_t __s0_201 = __p0_201; \
+  uint32x4_t __s1_201 = __p1_201; \
+  uint16x4_t __rev0_201;  __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \
+  uint32x4_t __rev1_201;  __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \
+  uint16x8_t __ret_201; \
+  __ret_201 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_201), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_201, __p2_201)))); \
   __ret_201 = __builtin_shufflevector(__ret_201, __ret_201, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_201; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrun_high_n_s64(__p0_202, __p1_202, __p2_202) __extension__ ({ \
-  int32x2_t __s0_202 = __p0_202; \
-  int64x2_t __s1_202 = __p1_202; \
-  int32x4_t __ret_202; \
-  __ret_202 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_202), (int32x2_t)(vqshrun_n_s64(__s1_202, __p2_202)))); \
+#define vqshrn_high_n_u64(__p0_202, __p1_202, __p2_202) __extension__ ({ \
+  uint32x2_t __s0_202 = __p0_202; \
+  uint64x2_t __s1_202 = __p1_202; \
+  uint32x4_t __ret_202; \
+  __ret_202 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_202), (uint32x2_t)(vqshrn_n_u64(__s1_202, __p2_202)))); \
   __ret_202; \
 })
 #else
-#define vqshrun_high_n_s64(__p0_203, __p1_203, __p2_203) __extension__ ({ \
-  int32x2_t __s0_203 = __p0_203; \
-  int64x2_t __s1_203 = __p1_203; \
-  int32x2_t __rev0_203;  __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \
-  int64x2_t __rev1_203;  __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \
-  int32x4_t __ret_203; \
-  __ret_203 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_203), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_203, __p2_203)))); \
+#define vqshrn_high_n_u64(__p0_203, __p1_203, __p2_203) __extension__ ({ \
+  uint32x2_t __s0_203 = __p0_203; \
+  uint64x2_t __s1_203 = __p1_203; \
+  uint32x2_t __rev0_203;  __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \
+  uint64x2_t __rev1_203;  __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \
+  uint32x4_t __ret_203; \
+  __ret_203 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_203), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_203, __p2_203)))); \
   __ret_203 = __builtin_shufflevector(__ret_203, __ret_203, 3, 2, 1, 0); \
   __ret_203; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqshrun_high_n_s16(__p0_204, __p1_204, __p2_204) __extension__ ({ \
-  int8x8_t __s0_204 = __p0_204; \
-  int16x8_t __s1_204 = __p1_204; \
-  int8x16_t __ret_204; \
-  __ret_204 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_204), (int8x8_t)(vqshrun_n_s16(__s1_204, __p2_204)))); \
+#define vqshrn_high_n_u16(__p0_204, __p1_204, __p2_204) __extension__ ({ \
+  uint8x8_t __s0_204 = __p0_204; \
+  uint16x8_t __s1_204 = __p1_204; \
+  uint8x16_t __ret_204; \
+  __ret_204 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_204), (uint8x8_t)(vqshrn_n_u16(__s1_204, __p2_204)))); \
   __ret_204; \
 })
 #else
-#define vqshrun_high_n_s16(__p0_205, __p1_205, __p2_205) __extension__ ({ \
-  int8x8_t __s0_205 = __p0_205; \
-  int16x8_t __s1_205 = __p1_205; \
-  int8x8_t __rev0_205;  __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_205;  __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __ret_205; \
-  __ret_205 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_205), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_205, __p2_205)))); \
+#define vqshrn_high_n_u16(__p0_205, __p1_205, __p2_205) __extension__ ({ \
+  uint8x8_t __s0_205 = __p0_205; \
+  uint16x8_t __s1_205 = __p1_205; \
+  uint8x8_t __rev0_205;  __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_205;  __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_205; \
+  __ret_205 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_205), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_205, __p2_205)))); \
   __ret_205 = __builtin_shufflevector(__ret_205, __ret_205, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_205; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_s32(__p0_206, __p1_206, __p2_206) __extension__ ({ \
+  int16x4_t __s0_206 = __p0_206; \
+  int32x4_t __s1_206 = __p1_206; \
+  int16x8_t __ret_206; \
+  __ret_206 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_206), (int16x4_t)(vqshrn_n_s32(__s1_206, __p2_206)))); \
+  __ret_206; \
+})
+#else
+#define vqshrn_high_n_s32(__p0_207, __p1_207, __p2_207) __extension__ ({ \
+  int16x4_t __s0_207 = __p0_207; \
+  int32x4_t __s1_207 = __p1_207; \
+  int16x4_t __rev0_207;  __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 3, 2, 1, 0); \
+  int32x4_t __rev1_207;  __rev1_207 = __builtin_shufflevector(__s1_207, __s1_207, 3, 2, 1, 0); \
+  int16x8_t __ret_207; \
+  __ret_207 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_207), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_207, __p2_207)))); \
+  __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_207; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_s64(__p0_208, __p1_208, __p2_208) __extension__ ({ \
+  int32x2_t __s0_208 = __p0_208; \
+  int64x2_t __s1_208 = __p1_208; \
+  int32x4_t __ret_208; \
+  __ret_208 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_208), (int32x2_t)(vqshrn_n_s64(__s1_208, __p2_208)))); \
+  __ret_208; \
+})
+#else
+#define vqshrn_high_n_s64(__p0_209, __p1_209, __p2_209) __extension__ ({ \
+  int32x2_t __s0_209 = __p0_209; \
+  int64x2_t __s1_209 = __p1_209; \
+  int32x2_t __rev0_209;  __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 1, 0); \
+  int64x2_t __rev1_209;  __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 1, 0); \
+  int32x4_t __ret_209; \
+  __ret_209 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_209), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_209, __p2_209)))); \
+  __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \
+  __ret_209; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_s16(__p0_210, __p1_210, __p2_210) __extension__ ({ \
+  int8x8_t __s0_210 = __p0_210; \
+  int16x8_t __s1_210 = __p1_210; \
+  int8x16_t __ret_210; \
+  __ret_210 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_210), (int8x8_t)(vqshrn_n_s16(__s1_210, __p2_210)))); \
+  __ret_210; \
+})
+#else
+#define vqshrn_high_n_s16(__p0_211, __p1_211, __p2_211) __extension__ ({ \
+  int8x8_t __s0_211 = __p0_211; \
+  int16x8_t __s1_211 = __p1_211; \
+  int8x8_t __rev0_211;  __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_211;  __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_211; \
+  __ret_211 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_211), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_211, __p2_211)))); \
+  __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_211; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_high_n_s32(__p0_212, __p1_212, __p2_212) __extension__ ({ \
+  int16x4_t __s0_212 = __p0_212; \
+  int32x4_t __s1_212 = __p1_212; \
+  int16x8_t __ret_212; \
+  __ret_212 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_212), (int16x4_t)(vqshrun_n_s32(__s1_212, __p2_212)))); \
+  __ret_212; \
+})
+#else
+#define vqshrun_high_n_s32(__p0_213, __p1_213, __p2_213) __extension__ ({ \
+  int16x4_t __s0_213 = __p0_213; \
+  int32x4_t __s1_213 = __p1_213; \
+  int16x4_t __rev0_213;  __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \
+  int32x4_t __rev1_213;  __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \
+  int16x8_t __ret_213; \
+  __ret_213 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_213), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_213, __p2_213)))); \
+  __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_213; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_high_n_s64(__p0_214, __p1_214, __p2_214) __extension__ ({ \
+  int32x2_t __s0_214 = __p0_214; \
+  int64x2_t __s1_214 = __p1_214; \
+  int32x4_t __ret_214; \
+  __ret_214 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_214), (int32x2_t)(vqshrun_n_s64(__s1_214, __p2_214)))); \
+  __ret_214; \
+})
+#else
+#define vqshrun_high_n_s64(__p0_215, __p1_215, __p2_215) __extension__ ({ \
+  int32x2_t __s0_215 = __p0_215; \
+  int64x2_t __s1_215 = __p1_215; \
+  int32x2_t __rev0_215;  __rev0_215 = __builtin_shufflevector(__s0_215, __s0_215, 1, 0); \
+  int64x2_t __rev1_215;  __rev1_215 = __builtin_shufflevector(__s1_215, __s1_215, 1, 0); \
+  int32x4_t __ret_215; \
+  __ret_215 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_215), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_215, __p2_215)))); \
+  __ret_215 = __builtin_shufflevector(__ret_215, __ret_215, 3, 2, 1, 0); \
+  __ret_215; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_high_n_s16(__p0_216, __p1_216, __p2_216) __extension__ ({ \
+  int8x8_t __s0_216 = __p0_216; \
+  int16x8_t __s1_216 = __p1_216; \
+  int8x16_t __ret_216; \
+  __ret_216 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_216), (int8x8_t)(vqshrun_n_s16(__s1_216, __p2_216)))); \
+  __ret_216; \
+})
+#else
+#define vqshrun_high_n_s16(__p0_217, __p1_217, __p2_217) __extension__ ({ \
+  int8x8_t __s0_217 = __p0_217; \
+  int16x8_t __s1_217 = __p1_217; \
+  int8x8_t __rev0_217;  __rev0_217 = __builtin_shufflevector(__s0_217, __s0_217, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_217;  __rev1_217 = __builtin_shufflevector(__s1_217, __s1_217, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_217; \
+  __ret_217 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_217), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_217, __p2_217)))); \
+  __ret_217 = __builtin_shufflevector(__ret_217, __ret_217, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_217; \
+})
+#endif
+
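The _high_n narrowing macros above all follow one pattern: narrow the wide operand, then append it to an existing low half with vcombine. On big-endian targets the inputs are lane-reversed with __builtin_shufflevector, the __noswap_ variants do the work on the reversed data, and the result is reversed back. A hedged sketch using the ACLE result types (this generated header spells the intermediates with signed vector types and relies on lax vector conversions):

#include <arm_neon.h>

void demo_qshrun_high(void) {   /* hypothetical helper */
  int32x4_t wide = vdupq_n_s32(70000);
  /* Saturating shift right, narrow to unsigned; fills the low 64 bits. */
  uint16x4_t low = vqshrun_n_s32(wide, 1);
  /* Narrow a second batch and combine into a full 128-bit vector. */
  uint16x8_t both = vqshrun_high_n_s32(low, wide, 1);
  (void)both;
}
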
+#ifdef __LITTLE_ENDIAN__
 #define vqshruns_n_s32(__p0, __p1) __extension__ ({ \
   int32_t __s0 = __p0; \
   int16_t __ret; \
@@ -60265,128 +65215,128 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_u32(__p0_206, __p1_206, __p2_206) __extension__ ({ \
-  uint16x4_t __s0_206 = __p0_206; \
-  uint32x4_t __s1_206 = __p1_206; \
-  uint16x8_t __ret_206; \
-  __ret_206 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_206), (uint16x4_t)(vrshrn_n_u32(__s1_206, __p2_206)))); \
-  __ret_206; \
+#define vrshrn_high_n_u32(__p0_218, __p1_218, __p2_218) __extension__ ({ \
+  uint16x4_t __s0_218 = __p0_218; \
+  uint32x4_t __s1_218 = __p1_218; \
+  uint16x8_t __ret_218; \
+  __ret_218 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_218), (uint16x4_t)(vrshrn_n_u32(__s1_218, __p2_218)))); \
+  __ret_218; \
 })
 #else
-#define vrshrn_high_n_u32(__p0_207, __p1_207, __p2_207) __extension__ ({ \
-  uint16x4_t __s0_207 = __p0_207; \
-  uint32x4_t __s1_207 = __p1_207; \
-  uint16x4_t __rev0_207;  __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 3, 2, 1, 0); \
-  uint32x4_t __rev1_207;  __rev1_207 = __builtin_shufflevector(__s1_207, __s1_207, 3, 2, 1, 0); \
-  uint16x8_t __ret_207; \
-  __ret_207 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_207), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_207, __p2_207)))); \
-  __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_207; \
+#define vrshrn_high_n_u32(__p0_219, __p1_219, __p2_219) __extension__ ({ \
+  uint16x4_t __s0_219 = __p0_219; \
+  uint32x4_t __s1_219 = __p1_219; \
+  uint16x4_t __rev0_219;  __rev0_219 = __builtin_shufflevector(__s0_219, __s0_219, 3, 2, 1, 0); \
+  uint32x4_t __rev1_219;  __rev1_219 = __builtin_shufflevector(__s1_219, __s1_219, 3, 2, 1, 0); \
+  uint16x8_t __ret_219; \
+  __ret_219 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_219), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_219, __p2_219)))); \
+  __ret_219 = __builtin_shufflevector(__ret_219, __ret_219, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_219; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_u64(__p0_208, __p1_208, __p2_208) __extension__ ({ \
-  uint32x2_t __s0_208 = __p0_208; \
-  uint64x2_t __s1_208 = __p1_208; \
-  uint32x4_t __ret_208; \
-  __ret_208 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_208), (uint32x2_t)(vrshrn_n_u64(__s1_208, __p2_208)))); \
-  __ret_208; \
+#define vrshrn_high_n_u64(__p0_220, __p1_220, __p2_220) __extension__ ({ \
+  uint32x2_t __s0_220 = __p0_220; \
+  uint64x2_t __s1_220 = __p1_220; \
+  uint32x4_t __ret_220; \
+  __ret_220 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_220), (uint32x2_t)(vrshrn_n_u64(__s1_220, __p2_220)))); \
+  __ret_220; \
 })
 #else
-#define vrshrn_high_n_u64(__p0_209, __p1_209, __p2_209) __extension__ ({ \
-  uint32x2_t __s0_209 = __p0_209; \
-  uint64x2_t __s1_209 = __p1_209; \
-  uint32x2_t __rev0_209;  __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 1, 0); \
-  uint64x2_t __rev1_209;  __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 1, 0); \
-  uint32x4_t __ret_209; \
-  __ret_209 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_209), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_209, __p2_209)))); \
-  __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \
-  __ret_209; \
+#define vrshrn_high_n_u64(__p0_221, __p1_221, __p2_221) __extension__ ({ \
+  uint32x2_t __s0_221 = __p0_221; \
+  uint64x2_t __s1_221 = __p1_221; \
+  uint32x2_t __rev0_221;  __rev0_221 = __builtin_shufflevector(__s0_221, __s0_221, 1, 0); \
+  uint64x2_t __rev1_221;  __rev1_221 = __builtin_shufflevector(__s1_221, __s1_221, 1, 0); \
+  uint32x4_t __ret_221; \
+  __ret_221 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_221), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_221, __p2_221)))); \
+  __ret_221 = __builtin_shufflevector(__ret_221, __ret_221, 3, 2, 1, 0); \
+  __ret_221; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_u16(__p0_210, __p1_210, __p2_210) __extension__ ({ \
-  uint8x8_t __s0_210 = __p0_210; \
-  uint16x8_t __s1_210 = __p1_210; \
-  uint8x16_t __ret_210; \
-  __ret_210 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_210), (uint8x8_t)(vrshrn_n_u16(__s1_210, __p2_210)))); \
-  __ret_210; \
+#define vrshrn_high_n_u16(__p0_222, __p1_222, __p2_222) __extension__ ({ \
+  uint8x8_t __s0_222 = __p0_222; \
+  uint16x8_t __s1_222 = __p1_222; \
+  uint8x16_t __ret_222; \
+  __ret_222 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_222), (uint8x8_t)(vrshrn_n_u16(__s1_222, __p2_222)))); \
+  __ret_222; \
 })
 #else
-#define vrshrn_high_n_u16(__p0_211, __p1_211, __p2_211) __extension__ ({ \
-  uint8x8_t __s0_211 = __p0_211; \
-  uint16x8_t __s1_211 = __p1_211; \
-  uint8x8_t __rev0_211;  __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_211;  __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __ret_211; \
-  __ret_211 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_211), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_211, __p2_211)))); \
-  __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_211; \
+#define vrshrn_high_n_u16(__p0_223, __p1_223, __p2_223) __extension__ ({ \
+  uint8x8_t __s0_223 = __p0_223; \
+  uint16x8_t __s1_223 = __p1_223; \
+  uint8x8_t __rev0_223;  __rev0_223 = __builtin_shufflevector(__s0_223, __s0_223, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_223;  __rev1_223 = __builtin_shufflevector(__s1_223, __s1_223, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_223; \
+  __ret_223 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_223), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_223, __p2_223)))); \
+  __ret_223 = __builtin_shufflevector(__ret_223, __ret_223, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_223; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_s32(__p0_212, __p1_212, __p2_212) __extension__ ({ \
-  int16x4_t __s0_212 = __p0_212; \
-  int32x4_t __s1_212 = __p1_212; \
-  int16x8_t __ret_212; \
-  __ret_212 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_212), (int16x4_t)(vrshrn_n_s32(__s1_212, __p2_212)))); \
-  __ret_212; \
+#define vrshrn_high_n_s32(__p0_224, __p1_224, __p2_224) __extension__ ({ \
+  int16x4_t __s0_224 = __p0_224; \
+  int32x4_t __s1_224 = __p1_224; \
+  int16x8_t __ret_224; \
+  __ret_224 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_224), (int16x4_t)(vrshrn_n_s32(__s1_224, __p2_224)))); \
+  __ret_224; \
 })
 #else
-#define vrshrn_high_n_s32(__p0_213, __p1_213, __p2_213) __extension__ ({ \
-  int16x4_t __s0_213 = __p0_213; \
-  int32x4_t __s1_213 = __p1_213; \
-  int16x4_t __rev0_213;  __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \
-  int32x4_t __rev1_213;  __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \
-  int16x8_t __ret_213; \
-  __ret_213 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_213), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_213, __p2_213)))); \
-  __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_213; \
+#define vrshrn_high_n_s32(__p0_225, __p1_225, __p2_225) __extension__ ({ \
+  int16x4_t __s0_225 = __p0_225; \
+  int32x4_t __s1_225 = __p1_225; \
+  int16x4_t __rev0_225;  __rev0_225 = __builtin_shufflevector(__s0_225, __s0_225, 3, 2, 1, 0); \
+  int32x4_t __rev1_225;  __rev1_225 = __builtin_shufflevector(__s1_225, __s1_225, 3, 2, 1, 0); \
+  int16x8_t __ret_225; \
+  __ret_225 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_225), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_225, __p2_225)))); \
+  __ret_225 = __builtin_shufflevector(__ret_225, __ret_225, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_225; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_s64(__p0_214, __p1_214, __p2_214) __extension__ ({ \
-  int32x2_t __s0_214 = __p0_214; \
-  int64x2_t __s1_214 = __p1_214; \
-  int32x4_t __ret_214; \
-  __ret_214 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_214), (int32x2_t)(vrshrn_n_s64(__s1_214, __p2_214)))); \
-  __ret_214; \
+#define vrshrn_high_n_s64(__p0_226, __p1_226, __p2_226) __extension__ ({ \
+  int32x2_t __s0_226 = __p0_226; \
+  int64x2_t __s1_226 = __p1_226; \
+  int32x4_t __ret_226; \
+  __ret_226 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_226), (int32x2_t)(vrshrn_n_s64(__s1_226, __p2_226)))); \
+  __ret_226; \
 })
 #else
-#define vrshrn_high_n_s64(__p0_215, __p1_215, __p2_215) __extension__ ({ \
-  int32x2_t __s0_215 = __p0_215; \
-  int64x2_t __s1_215 = __p1_215; \
-  int32x2_t __rev0_215;  __rev0_215 = __builtin_shufflevector(__s0_215, __s0_215, 1, 0); \
-  int64x2_t __rev1_215;  __rev1_215 = __builtin_shufflevector(__s1_215, __s1_215, 1, 0); \
-  int32x4_t __ret_215; \
-  __ret_215 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_215), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_215, __p2_215)))); \
-  __ret_215 = __builtin_shufflevector(__ret_215, __ret_215, 3, 2, 1, 0); \
-  __ret_215; \
+#define vrshrn_high_n_s64(__p0_227, __p1_227, __p2_227) __extension__ ({ \
+  int32x2_t __s0_227 = __p0_227; \
+  int64x2_t __s1_227 = __p1_227; \
+  int32x2_t __rev0_227;  __rev0_227 = __builtin_shufflevector(__s0_227, __s0_227, 1, 0); \
+  int64x2_t __rev1_227;  __rev1_227 = __builtin_shufflevector(__s1_227, __s1_227, 1, 0); \
+  int32x4_t __ret_227; \
+  __ret_227 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_227), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_227, __p2_227)))); \
+  __ret_227 = __builtin_shufflevector(__ret_227, __ret_227, 3, 2, 1, 0); \
+  __ret_227; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_s16(__p0_216, __p1_216, __p2_216) __extension__ ({ \
-  int8x8_t __s0_216 = __p0_216; \
-  int16x8_t __s1_216 = __p1_216; \
-  int8x16_t __ret_216; \
-  __ret_216 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_216), (int8x8_t)(vrshrn_n_s16(__s1_216, __p2_216)))); \
-  __ret_216; \
+#define vrshrn_high_n_s16(__p0_228, __p1_228, __p2_228) __extension__ ({ \
+  int8x8_t __s0_228 = __p0_228; \
+  int16x8_t __s1_228 = __p1_228; \
+  int8x16_t __ret_228; \
+  __ret_228 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_228), (int8x8_t)(vrshrn_n_s16(__s1_228, __p2_228)))); \
+  __ret_228; \
 })
 #else
-#define vrshrn_high_n_s16(__p0_217, __p1_217, __p2_217) __extension__ ({ \
-  int8x8_t __s0_217 = __p0_217; \
-  int16x8_t __s1_217 = __p1_217; \
-  int8x8_t __rev0_217;  __rev0_217 = __builtin_shufflevector(__s0_217, __s0_217, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_217;  __rev1_217 = __builtin_shufflevector(__s1_217, __s1_217, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __ret_217; \
-  __ret_217 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_217), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_217, __p2_217)))); \
-  __ret_217 = __builtin_shufflevector(__ret_217, __ret_217, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_217; \
+#define vrshrn_high_n_s16(__p0_229, __p1_229, __p2_229) __extension__ ({ \
+  int8x8_t __s0_229 = __p0_229; \
+  int16x8_t __s1_229 = __p1_229; \
+  int8x8_t __rev0_229;  __rev0_229 = __builtin_shufflevector(__s0_229, __s0_229, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_229;  __rev1_229 = __builtin_shufflevector(__s1_229, __s1_229, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_229; \
+  __ret_229 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_229), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_229, __p2_229)))); \
+  __ret_229 = __builtin_shufflevector(__ret_229, __ret_229, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_229; \
 })
 #endif
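
vrshrn_high_n_* uses the same combine pattern but rounds before narrowing: each lane has 1 << (n-1) added before the shift right by n. A minimal sketch with hypothetical data:

#include <arm_neon.h>

void demo_rshrn_high(void) {   /* hypothetical helper */
  uint32x4_t acc = vdupq_n_u32(511);
  uint16x4_t low = vrshrn_n_u32(acc, 4);   /* (511 + 8) >> 4 = 32, rounded */
  uint16x8_t both = vrshrn_high_n_u32(low, acc, 4);
  (void)both;
}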
 
@@ -60816,272 +65766,272 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_u8(__p0_218, __p1_218) __extension__ ({ \
-  uint8x16_t __s0_218 = __p0_218; \
-  uint16x8_t __ret_218; \
-  __ret_218 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_218), __p1_218)); \
-  __ret_218; \
-})
-#else
-#define vshll_high_n_u8(__p0_219, __p1_219) __extension__ ({ \
-  uint8x16_t __s0_219 = __p0_219; \
-  uint8x16_t __rev0_219;  __rev0_219 = __builtin_shufflevector(__s0_219, __s0_219, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __ret_219; \
-  __ret_219 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_219), __p1_219)); \
-  __ret_219 = __builtin_shufflevector(__ret_219, __ret_219, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_219; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_u32(__p0_220, __p1_220) __extension__ ({ \
-  uint32x4_t __s0_220 = __p0_220; \
-  uint64x2_t __ret_220; \
-  __ret_220 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_220), __p1_220)); \
-  __ret_220; \
-})
-#else
-#define vshll_high_n_u32(__p0_221, __p1_221) __extension__ ({ \
-  uint32x4_t __s0_221 = __p0_221; \
-  uint32x4_t __rev0_221;  __rev0_221 = __builtin_shufflevector(__s0_221, __s0_221, 3, 2, 1, 0); \
-  uint64x2_t __ret_221; \
-  __ret_221 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_221), __p1_221)); \
-  __ret_221 = __builtin_shufflevector(__ret_221, __ret_221, 1, 0); \
-  __ret_221; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_u16(__p0_222, __p1_222) __extension__ ({ \
-  uint16x8_t __s0_222 = __p0_222; \
-  uint32x4_t __ret_222; \
-  __ret_222 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_222), __p1_222)); \
-  __ret_222; \
-})
-#else
-#define vshll_high_n_u16(__p0_223, __p1_223) __extension__ ({ \
-  uint16x8_t __s0_223 = __p0_223; \
-  uint16x8_t __rev0_223;  __rev0_223 = __builtin_shufflevector(__s0_223, __s0_223, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint32x4_t __ret_223; \
-  __ret_223 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_223), __p1_223)); \
-  __ret_223 = __builtin_shufflevector(__ret_223, __ret_223, 3, 2, 1, 0); \
-  __ret_223; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_s8(__p0_224, __p1_224) __extension__ ({ \
-  int8x16_t __s0_224 = __p0_224; \
-  int16x8_t __ret_224; \
-  __ret_224 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_224), __p1_224)); \
-  __ret_224; \
-})
-#else
-#define vshll_high_n_s8(__p0_225, __p1_225) __extension__ ({ \
-  int8x16_t __s0_225 = __p0_225; \
-  int8x16_t __rev0_225;  __rev0_225 = __builtin_shufflevector(__s0_225, __s0_225, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __ret_225; \
-  __ret_225 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_225), __p1_225)); \
-  __ret_225 = __builtin_shufflevector(__ret_225, __ret_225, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_225; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_s32(__p0_226, __p1_226) __extension__ ({ \
-  int32x4_t __s0_226 = __p0_226; \
-  int64x2_t __ret_226; \
-  __ret_226 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_226), __p1_226)); \
-  __ret_226; \
-})
-#else
-#define vshll_high_n_s32(__p0_227, __p1_227) __extension__ ({ \
-  int32x4_t __s0_227 = __p0_227; \
-  int32x4_t __rev0_227;  __rev0_227 = __builtin_shufflevector(__s0_227, __s0_227, 3, 2, 1, 0); \
-  int64x2_t __ret_227; \
-  __ret_227 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_227), __p1_227)); \
-  __ret_227 = __builtin_shufflevector(__ret_227, __ret_227, 1, 0); \
-  __ret_227; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_s16(__p0_228, __p1_228) __extension__ ({ \
-  int16x8_t __s0_228 = __p0_228; \
-  int32x4_t __ret_228; \
-  __ret_228 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_228), __p1_228)); \
-  __ret_228; \
-})
-#else
-#define vshll_high_n_s16(__p0_229, __p1_229) __extension__ ({ \
-  int16x8_t __s0_229 = __p0_229; \
-  int16x8_t __rev0_229;  __rev0_229 = __builtin_shufflevector(__s0_229, __s0_229, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int32x4_t __ret_229; \
-  __ret_229 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_229), __p1_229)); \
-  __ret_229 = __builtin_shufflevector(__ret_229, __ret_229, 3, 2, 1, 0); \
-  __ret_229; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrd_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __s0 = __p0; \
-  uint64_t __ret; \
-  __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vshrd_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __s0 = __p0; \
-  uint64_t __ret; \
-  __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrd_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int64_t __ret; \
-  __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \
-  __ret; \
-})
-#else
-#define vshrd_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __s0 = __p0; \
-  int64_t __ret; \
-  __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_u32(__p0_230, __p1_230, __p2_230) __extension__ ({ \
-  uint16x4_t __s0_230 = __p0_230; \
-  uint32x4_t __s1_230 = __p1_230; \
+#define vshll_high_n_u8(__p0_230, __p1_230) __extension__ ({ \
+  uint8x16_t __s0_230 = __p0_230; \
   uint16x8_t __ret_230; \
-  __ret_230 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_230), (uint16x4_t)(vshrn_n_u32(__s1_230, __p2_230)))); \
+  __ret_230 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_230), __p1_230)); \
   __ret_230; \
 })
 #else
-#define vshrn_high_n_u32(__p0_231, __p1_231, __p2_231) __extension__ ({ \
-  uint16x4_t __s0_231 = __p0_231; \
-  uint32x4_t __s1_231 = __p1_231; \
-  uint16x4_t __rev0_231;  __rev0_231 = __builtin_shufflevector(__s0_231, __s0_231, 3, 2, 1, 0); \
-  uint32x4_t __rev1_231;  __rev1_231 = __builtin_shufflevector(__s1_231, __s1_231, 3, 2, 1, 0); \
+#define vshll_high_n_u8(__p0_231, __p1_231) __extension__ ({ \
+  uint8x16_t __s0_231 = __p0_231; \
+  uint8x16_t __rev0_231;  __rev0_231 = __builtin_shufflevector(__s0_231, __s0_231, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   uint16x8_t __ret_231; \
-  __ret_231 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_231), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_231, __p2_231)))); \
+  __ret_231 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_231), __p1_231)); \
   __ret_231 = __builtin_shufflevector(__ret_231, __ret_231, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_231; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_u64(__p0_232, __p1_232, __p2_232) __extension__ ({ \
-  uint32x2_t __s0_232 = __p0_232; \
-  uint64x2_t __s1_232 = __p1_232; \
-  uint32x4_t __ret_232; \
-  __ret_232 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_232), (uint32x2_t)(vshrn_n_u64(__s1_232, __p2_232)))); \
+#define vshll_high_n_u32(__p0_232, __p1_232) __extension__ ({ \
+  uint32x4_t __s0_232 = __p0_232; \
+  uint64x2_t __ret_232; \
+  __ret_232 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_232), __p1_232)); \
   __ret_232; \
 })
 #else
-#define vshrn_high_n_u64(__p0_233, __p1_233, __p2_233) __extension__ ({ \
-  uint32x2_t __s0_233 = __p0_233; \
-  uint64x2_t __s1_233 = __p1_233; \
-  uint32x2_t __rev0_233;  __rev0_233 = __builtin_shufflevector(__s0_233, __s0_233, 1, 0); \
-  uint64x2_t __rev1_233;  __rev1_233 = __builtin_shufflevector(__s1_233, __s1_233, 1, 0); \
-  uint32x4_t __ret_233; \
-  __ret_233 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_233), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_233, __p2_233)))); \
-  __ret_233 = __builtin_shufflevector(__ret_233, __ret_233, 3, 2, 1, 0); \
+#define vshll_high_n_u32(__p0_233, __p1_233) __extension__ ({ \
+  uint32x4_t __s0_233 = __p0_233; \
+  uint32x4_t __rev0_233;  __rev0_233 = __builtin_shufflevector(__s0_233, __s0_233, 3, 2, 1, 0); \
+  uint64x2_t __ret_233; \
+  __ret_233 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_233), __p1_233)); \
+  __ret_233 = __builtin_shufflevector(__ret_233, __ret_233, 1, 0); \
   __ret_233; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_u16(__p0_234, __p1_234, __p2_234) __extension__ ({ \
-  uint8x8_t __s0_234 = __p0_234; \
-  uint16x8_t __s1_234 = __p1_234; \
-  uint8x16_t __ret_234; \
-  __ret_234 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_234), (uint8x8_t)(vshrn_n_u16(__s1_234, __p2_234)))); \
+#define vshll_high_n_u16(__p0_234, __p1_234) __extension__ ({ \
+  uint16x8_t __s0_234 = __p0_234; \
+  uint32x4_t __ret_234; \
+  __ret_234 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_234), __p1_234)); \
   __ret_234; \
 })
 #else
-#define vshrn_high_n_u16(__p0_235, __p1_235, __p2_235) __extension__ ({ \
-  uint8x8_t __s0_235 = __p0_235; \
-  uint16x8_t __s1_235 = __p1_235; \
-  uint8x8_t __rev0_235;  __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_235;  __rev1_235 = __builtin_shufflevector(__s1_235, __s1_235, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __ret_235; \
-  __ret_235 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_235), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_235, __p2_235)))); \
-  __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+#define vshll_high_n_u16(__p0_235, __p1_235) __extension__ ({ \
+  uint16x8_t __s0_235 = __p0_235; \
+  uint16x8_t __rev0_235;  __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret_235; \
+  __ret_235 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_235), __p1_235)); \
+  __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 3, 2, 1, 0); \
   __ret_235; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_s32(__p0_236, __p1_236, __p2_236) __extension__ ({ \
-  int16x4_t __s0_236 = __p0_236; \
-  int32x4_t __s1_236 = __p1_236; \
+#define vshll_high_n_s8(__p0_236, __p1_236) __extension__ ({ \
+  int8x16_t __s0_236 = __p0_236; \
   int16x8_t __ret_236; \
-  __ret_236 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_236), (int16x4_t)(vshrn_n_s32(__s1_236, __p2_236)))); \
+  __ret_236 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_236), __p1_236)); \
   __ret_236; \
 })
 #else
-#define vshrn_high_n_s32(__p0_237, __p1_237, __p2_237) __extension__ ({ \
-  int16x4_t __s0_237 = __p0_237; \
-  int32x4_t __s1_237 = __p1_237; \
-  int16x4_t __rev0_237;  __rev0_237 = __builtin_shufflevector(__s0_237, __s0_237, 3, 2, 1, 0); \
-  int32x4_t __rev1_237;  __rev1_237 = __builtin_shufflevector(__s1_237, __s1_237, 3, 2, 1, 0); \
+#define vshll_high_n_s8(__p0_237, __p1_237) __extension__ ({ \
+  int8x16_t __s0_237 = __p0_237; \
+  int8x16_t __rev0_237;  __rev0_237 = __builtin_shufflevector(__s0_237, __s0_237, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
   int16x8_t __ret_237; \
-  __ret_237 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_237), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_237, __p2_237)))); \
+  __ret_237 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_237), __p1_237)); \
   __ret_237 = __builtin_shufflevector(__ret_237, __ret_237, 7, 6, 5, 4, 3, 2, 1, 0); \
   __ret_237; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_s64(__p0_238, __p1_238, __p2_238) __extension__ ({ \
-  int32x2_t __s0_238 = __p0_238; \
-  int64x2_t __s1_238 = __p1_238; \
-  int32x4_t __ret_238; \
-  __ret_238 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_238), (int32x2_t)(vshrn_n_s64(__s1_238, __p2_238)))); \
+#define vshll_high_n_s32(__p0_238, __p1_238) __extension__ ({ \
+  int32x4_t __s0_238 = __p0_238; \
+  int64x2_t __ret_238; \
+  __ret_238 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_238), __p1_238)); \
   __ret_238; \
 })
 #else
-#define vshrn_high_n_s64(__p0_239, __p1_239, __p2_239) __extension__ ({ \
-  int32x2_t __s0_239 = __p0_239; \
-  int64x2_t __s1_239 = __p1_239; \
-  int32x2_t __rev0_239;  __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 1, 0); \
-  int64x2_t __rev1_239;  __rev1_239 = __builtin_shufflevector(__s1_239, __s1_239, 1, 0); \
-  int32x4_t __ret_239; \
-  __ret_239 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_239), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_239, __p2_239)))); \
-  __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 3, 2, 1, 0); \
+#define vshll_high_n_s32(__p0_239, __p1_239) __extension__ ({ \
+  int32x4_t __s0_239 = __p0_239; \
+  int32x4_t __rev0_239;  __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 3, 2, 1, 0); \
+  int64x2_t __ret_239; \
+  __ret_239 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_239), __p1_239)); \
+  __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 1, 0); \
   __ret_239; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_s16(__p0_240, __p1_240, __p2_240) __extension__ ({ \
-  int8x8_t __s0_240 = __p0_240; \
-  int16x8_t __s1_240 = __p1_240; \
-  int8x16_t __ret_240; \
-  __ret_240 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_240), (int8x8_t)(vshrn_n_s16(__s1_240, __p2_240)))); \
+#define vshll_high_n_s16(__p0_240, __p1_240) __extension__ ({ \
+  int16x8_t __s0_240 = __p0_240; \
+  int32x4_t __ret_240; \
+  __ret_240 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_240), __p1_240)); \
   __ret_240; \
 })
 #else
-#define vshrn_high_n_s16(__p0_241, __p1_241, __p2_241) __extension__ ({ \
-  int8x8_t __s0_241 = __p0_241; \
-  int16x8_t __s1_241 = __p1_241; \
-  int8x8_t __rev0_241;  __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_241;  __rev1_241 = __builtin_shufflevector(__s1_241, __s1_241, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __ret_241; \
-  __ret_241 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_241), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_241, __p2_241)))); \
-  __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+#define vshll_high_n_s16(__p0_241, __p1_241) __extension__ ({ \
+  int16x8_t __s0_241 = __p0_241; \
+  int16x8_t __rev0_241;  __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret_241; \
+  __ret_241 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_241), __p1_241)); \
+  __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 3, 2, 1, 0); \
   __ret_241; \
 })
 #endif
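
The vshll_high_n_* widening macros above extract the upper half of a 128-bit vector with vget_high_* and shift-left-lengthen it, the mirror image of the narrowing *_high forms. A hedged sketch with hypothetical values:

#include <arm_neon.h>

void demo_shll_high(void) {   /* hypothetical helper */
  uint8x16_t bytes = vdupq_n_u8(3);
  /* Widen the upper 8 lanes to 16 bits, shifting each left by 4. */
  uint16x8_t hi = vshll_high_n_u8(bytes, 4);   /* each lane: 3 << 4 = 48 */
  (void)hi;
}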
 
 #ifdef __LITTLE_ENDIAN__
+#define vshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_u32(__p0_242, __p1_242, __p2_242) __extension__ ({ \
+  uint16x4_t __s0_242 = __p0_242; \
+  uint32x4_t __s1_242 = __p1_242; \
+  uint16x8_t __ret_242; \
+  __ret_242 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_242), (uint16x4_t)(vshrn_n_u32(__s1_242, __p2_242)))); \
+  __ret_242; \
+})
+#else
+#define vshrn_high_n_u32(__p0_243, __p1_243, __p2_243) __extension__ ({ \
+  uint16x4_t __s0_243 = __p0_243; \
+  uint32x4_t __s1_243 = __p1_243; \
+  uint16x4_t __rev0_243;  __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 3, 2, 1, 0); \
+  uint32x4_t __rev1_243;  __rev1_243 = __builtin_shufflevector(__s1_243, __s1_243, 3, 2, 1, 0); \
+  uint16x8_t __ret_243; \
+  __ret_243 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_243), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_243, __p2_243)))); \
+  __ret_243 = __builtin_shufflevector(__ret_243, __ret_243, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_243; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_u64(__p0_244, __p1_244, __p2_244) __extension__ ({ \
+  uint32x2_t __s0_244 = __p0_244; \
+  uint64x2_t __s1_244 = __p1_244; \
+  uint32x4_t __ret_244; \
+  __ret_244 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_244), (uint32x2_t)(vshrn_n_u64(__s1_244, __p2_244)))); \
+  __ret_244; \
+})
+#else
+#define vshrn_high_n_u64(__p0_245, __p1_245, __p2_245) __extension__ ({ \
+  uint32x2_t __s0_245 = __p0_245; \
+  uint64x2_t __s1_245 = __p1_245; \
+  uint32x2_t __rev0_245;  __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 1, 0); \
+  uint64x2_t __rev1_245;  __rev1_245 = __builtin_shufflevector(__s1_245, __s1_245, 1, 0); \
+  uint32x4_t __ret_245; \
+  __ret_245 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_245), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_245, __p2_245)))); \
+  __ret_245 = __builtin_shufflevector(__ret_245, __ret_245, 3, 2, 1, 0); \
+  __ret_245; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_u16(__p0_246, __p1_246, __p2_246) __extension__ ({ \
+  uint8x8_t __s0_246 = __p0_246; \
+  uint16x8_t __s1_246 = __p1_246; \
+  uint8x16_t __ret_246; \
+  __ret_246 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_246), (uint8x8_t)(vshrn_n_u16(__s1_246, __p2_246)))); \
+  __ret_246; \
+})
+#else
+#define vshrn_high_n_u16(__p0_247, __p1_247, __p2_247) __extension__ ({ \
+  uint8x8_t __s0_247 = __p0_247; \
+  uint16x8_t __s1_247 = __p1_247; \
+  uint8x8_t __rev0_247;  __rev0_247 = __builtin_shufflevector(__s0_247, __s0_247, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_247;  __rev1_247 = __builtin_shufflevector(__s1_247, __s1_247, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_247; \
+  __ret_247 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_247), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_247, __p2_247)))); \
+  __ret_247 = __builtin_shufflevector(__ret_247, __ret_247, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_247; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_s32(__p0_248, __p1_248, __p2_248) __extension__ ({ \
+  int16x4_t __s0_248 = __p0_248; \
+  int32x4_t __s1_248 = __p1_248; \
+  int16x8_t __ret_248; \
+  __ret_248 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_248), (int16x4_t)(vshrn_n_s32(__s1_248, __p2_248)))); \
+  __ret_248; \
+})
+#else
+#define vshrn_high_n_s32(__p0_249, __p1_249, __p2_249) __extension__ ({ \
+  int16x4_t __s0_249 = __p0_249; \
+  int32x4_t __s1_249 = __p1_249; \
+  int16x4_t __rev0_249;  __rev0_249 = __builtin_shufflevector(__s0_249, __s0_249, 3, 2, 1, 0); \
+  int32x4_t __rev1_249;  __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 3, 2, 1, 0); \
+  int16x8_t __ret_249; \
+  __ret_249 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_249), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_249, __p2_249)))); \
+  __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_249; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_s64(__p0_250, __p1_250, __p2_250) __extension__ ({ \
+  int32x2_t __s0_250 = __p0_250; \
+  int64x2_t __s1_250 = __p1_250; \
+  int32x4_t __ret_250; \
+  __ret_250 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_250), (int32x2_t)(vshrn_n_s64(__s1_250, __p2_250)))); \
+  __ret_250; \
+})
+#else
+#define vshrn_high_n_s64(__p0_251, __p1_251, __p2_251) __extension__ ({ \
+  int32x2_t __s0_251 = __p0_251; \
+  int64x2_t __s1_251 = __p1_251; \
+  int32x2_t __rev0_251;  __rev0_251 = __builtin_shufflevector(__s0_251, __s0_251, 1, 0); \
+  int64x2_t __rev1_251;  __rev1_251 = __builtin_shufflevector(__s1_251, __s1_251, 1, 0); \
+  int32x4_t __ret_251; \
+  __ret_251 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_251), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_251, __p2_251)))); \
+  __ret_251 = __builtin_shufflevector(__ret_251, __ret_251, 3, 2, 1, 0); \
+  __ret_251; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_s16(__p0_252, __p1_252, __p2_252) __extension__ ({ \
+  int8x8_t __s0_252 = __p0_252; \
+  int16x8_t __s1_252 = __p1_252; \
+  int8x16_t __ret_252; \
+  __ret_252 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_252), (int8x8_t)(vshrn_n_s16(__s1_252, __p2_252)))); \
+  __ret_252; \
+})
+#else
+#define vshrn_high_n_s16(__p0_253, __p1_253, __p2_253) __extension__ ({ \
+  int8x8_t __s0_253 = __p0_253; \
+  int16x8_t __s1_253 = __p1_253; \
+  int8x8_t __rev0_253;  __rev0_253 = __builtin_shufflevector(__s0_253, __s0_253, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_253;  __rev1_253 = __builtin_shufflevector(__s1_253, __s1_253, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_253; \
+  __ret_253 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_253), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_253, __p2_253)))); \
+  __ret_253 = __builtin_shufflevector(__ret_253, __ret_253, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_253; \
+})
+#endif
+
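vshrn_high_n_* is the truncating counterpart of the rounding form earlier in this section; paired with vshll_high_n_*, it lets a widen/process/narrow pipeline stay on full 128-bit vectors. A minimal sketch with hypothetical data:

#include <arm_neon.h>

void demo_shrn_high(void) {   /* hypothetical helper */
  int32x4_t sums = vdupq_n_s32(4096);
  int16x4_t low = vshrn_n_s32(sums, 8);   /* 4096 >> 8 = 16, truncating */
  int16x8_t both = vshrn_high_n_s32(low, sums, 8);
  (void)both;
}
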
+#ifdef __LITTLE_ENDIAN__
 #define vslid_n_u64(__p0, __p1, __p2) __extension__ ({ \
   uint64_t __s0 = __p0; \
   uint64_t __s1 = __p1; \
@@ -61619,21 +66569,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
-  poly8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
-})
-#else
-#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
-  poly8x8x2_t __s1 = __p1; \
-  poly8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1_p64_x2(__p0, __p1) __extension__ ({ \
   poly64x1x2_t __s1 = __p1; \
   __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
@@ -61646,36 +66581,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
-  poly16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \
-})
-#else
-#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
-  poly16x4x2_t __s1 = __p1; \
-  poly16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
-  poly8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \
-})
-#else
-#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
-  poly8x16x2_t __s1 = __p1; \
-  poly8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1q_p64_x2(__p0, __p1) __extension__ ({ \
   poly64x2x2_t __s1 = __p1; \
   __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 38); \
@@ -61691,96 +66596,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
-  poly16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \
-})
-#else
-#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
-  poly16x8x2_t __s1 = __p1; \
-  poly16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
-  uint8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \
-})
-#else
-#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
-  uint8x16x2_t __s1 = __p1; \
-  uint8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
-  uint32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \
-})
-#else
-#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
-  uint32x4x2_t __s1 = __p1; \
-  uint32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
-  uint64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \
-})
-#else
-#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
-  uint64x2x2_t __s1 = __p1; \
-  uint64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
-  uint16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \
-})
-#else
-#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
-  uint16x8x2_t __s1 = __p1; \
-  uint16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
-  int8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \
-})
-#else
-#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
-  int8x16x2_t __s1 = __p1; \
-  int8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1q_f64_x2(__p0, __p1) __extension__ ({ \
   float64x2x2_t __s1 = __p1; \
   __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 42); \
@@ -61796,153 +66611,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
-  float32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 41); \
-})
-#else
-#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
-  float32x4x2_t __s1 = __p1; \
-  float32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
-  float16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 40); \
-})
-#else
-#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
-  float16x8x2_t __s1 = __p1; \
-  float16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
-  int32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 34); \
-})
-#else
-#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
-  int32x4x2_t __s1 = __p1; \
-  int32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
-  int64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 35); \
-})
-#else
-#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
-  int64x2x2_t __s1 = __p1; \
-  int64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 35); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
-  int16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 33); \
-})
-#else
-#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
-  int16x8x2_t __s1 = __p1; \
-  int16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
-  uint8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \
-})
-#else
-#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
-  uint8x8x2_t __s1 = __p1; \
-  uint8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
-  uint32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \
-})
-#else
-#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
-  uint32x2x2_t __s1 = __p1; \
-  uint32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u64_x2(__p0, __p1) __extension__ ({ \
-  uint64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
-})
-#else
-#define vst1_u64_x2(__p0, __p1) __extension__ ({ \
-  uint64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
-  uint16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \
-})
-#else
-#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
-  uint16x4x2_t __s1 = __p1; \
-  uint16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
-  int8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \
-})
-#else
-#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
-  int8x8x2_t __s1 = __p1; \
-  int8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1_f64_x2(__p0, __p1) __extension__ ({ \
   float64x1x2_t __s1 = __p1; \
   __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 10); \
@@ -61955,94 +66623,6 @@
 #endif
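
The vst1*_x2/_x3 store macros in this region all lower to a single __builtin_neon_vst1{q}_x{2,3}_v call whose trailing integer argument encodes the element type, with the big-endian branch lane-reversing each .val[] member first. A hedged sketch for the float64 form kept above, using a hypothetical buffer:

#include <arm_neon.h>

void demo_st1_x2(void) {   /* hypothetical helper */
  float64_t buf[4];
  float64x2x2_t pair;
  pair.val[0] = vdupq_n_f64(1.0);
  pair.val[1] = vdupq_n_f64(2.0);
  vst1q_f64_x2(buf, pair);   /* stores four doubles contiguously */
}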
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
-  float32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 9); \
-})
-#else
-#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
-  float32x2x2_t __s1 = __p1; \
-  float32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
-  float16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 8); \
-})
-#else
-#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
-  float16x4x2_t __s1 = __p1; \
-  float16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
-  int32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 2); \
-})
-#else
-#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
-  int32x2x2_t __s1 = __p1; \
-  int32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 2); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s64_x2(__p0, __p1) __extension__ ({ \
-  int64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \
-})
-#else
-#define vst1_s64_x2(__p0, __p1) __extension__ ({ \
-  int64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
-  int16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 1); \
-})
-#else
-#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
-  int16x4x2_t __s1 = __p1; \
-  int16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
-  poly8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \
-})
-#else
-#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
-  poly8x8x3_t __s1 = __p1; \
-  poly8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1_p64_x3(__p0, __p1) __extension__ ({ \
   poly64x1x3_t __s1 = __p1; \
   __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
@@ -62055,38 +66635,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
-  poly16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \
-})
-#else
-#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
-  poly16x4x3_t __s1 = __p1; \
-  poly16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
-  poly8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \
-})
-#else
-#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
-  poly8x16x3_t __s1 = __p1; \
-  poly8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1q_p64_x3(__p0, __p1) __extension__ ({ \
   poly64x2x3_t __s1 = __p1; \
   __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 38); \
@@ -62103,102 +66651,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
-  poly16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \
-})
-#else
-#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
-  poly16x8x3_t __s1 = __p1; \
-  poly16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
-  uint8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \
-})
-#else
-#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
-  uint8x16x3_t __s1 = __p1; \
-  uint8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
-  uint32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \
-})
-#else
-#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
-  uint32x4x3_t __s1 = __p1; \
-  uint32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
-  uint64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \
-})
-#else
-#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
-  uint64x2x3_t __s1 = __p1; \
-  uint64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
-  uint16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \
-})
-#else
-#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
-  uint16x8x3_t __s1 = __p1; \
-  uint16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
-  int8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \
-})
-#else
-#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
-  int8x16x3_t __s1 = __p1; \
-  int8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1q_f64_x3(__p0, __p1) __extension__ ({ \
   float64x2x3_t __s1 = __p1; \
   __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 42); \
@@ -62215,162 +66667,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
-  float32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 41); \
-})
-#else
-#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
-  float32x4x3_t __s1 = __p1; \
-  float32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
-  float16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 40); \
-})
-#else
-#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
-  float16x8x3_t __s1 = __p1; \
-  float16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
-  int32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 34); \
-})
-#else
-#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
-  int32x4x3_t __s1 = __p1; \
-  int32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
-  int64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 35); \
-})
-#else
-#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
-  int64x2x3_t __s1 = __p1; \
-  int64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 35); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
-  int16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 33); \
-})
-#else
-#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
-  int16x8x3_t __s1 = __p1; \
-  int16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
-  uint8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \
-})
-#else
-#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
-  uint8x8x3_t __s1 = __p1; \
-  uint8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
-  uint32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \
-})
-#else
-#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
-  uint32x2x3_t __s1 = __p1; \
-  uint32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u64_x3(__p0, __p1) __extension__ ({ \
-  uint64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
-})
-#else
-#define vst1_u64_x3(__p0, __p1) __extension__ ({ \
-  uint64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
-  uint16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \
-})
-#else
-#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
-  uint16x4x3_t __s1 = __p1; \
-  uint16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
-  int8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \
-})
-#else
-#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
-  int8x8x3_t __s1 = __p1; \
-  int8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1_f64_x3(__p0, __p1) __extension__ ({ \
   float64x1x3_t __s1 = __p1; \
   __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \
@@ -62383,99 +66679,6 @@
 #endif
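/*
 * Editorial note on the vst1*_x3/_x4 family shuffled around in the hunks
 * above and below: these macros store three or four whole vectors to
 * consecutive memory (no de-interleaving -- that is vst3/vst4).  The
 * big-endian branch first reverses each lane vector with
 * __builtin_shufflevector, because the underlying builtins assume the
 * little-endian in-register lane order; single-lane 64x1 variants
 * (e.g. vst1_u64_x3) expand identically in both branches since there is
 * nothing to reverse.  A minimal usage sketch, with hypothetical buffers
 * that are not part of this header:
 *
 *   uint16_t buf[24];                     // room for 3 x 8 lanes
 *   uint16x8x3_t v = vld1q_u16_x3(src);   // 'src' is a hypothetical source
 *   vst1q_u16_x3(buf, v);                 // stores val[0..2] back-to-back
 */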
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
-  float32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 9); \
-})
-#else
-#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
-  float32x2x3_t __s1 = __p1; \
-  float32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
-  float16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 8); \
-})
-#else
-#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
-  float16x4x3_t __s1 = __p1; \
-  float16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
-  int32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 2); \
-})
-#else
-#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
-  int32x2x3_t __s1 = __p1; \
-  int32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 2); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s64_x3(__p0, __p1) __extension__ ({ \
-  int64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
-})
-#else
-#define vst1_s64_x3(__p0, __p1) __extension__ ({ \
-  int64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
-  int16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 1); \
-})
-#else
-#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
-  int16x4x3_t __s1 = __p1; \
-  int16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
-  poly8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \
-})
-#else
-#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
-  poly8x8x4_t __s1 = __p1; \
-  poly8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1_p64_x4(__p0, __p1) __extension__ ({ \
   poly64x1x4_t __s1 = __p1; \
   __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
@@ -62488,40 +66691,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
-  poly16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \
-})
-#else
-#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
-  poly16x4x4_t __s1 = __p1; \
-  poly16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
-  poly8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \
-})
-#else
-#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
-  poly8x16x4_t __s1 = __p1; \
-  poly8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1q_p64_x4(__p0, __p1) __extension__ ({ \
   poly64x2x4_t __s1 = __p1; \
   __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 38); \
@@ -62539,108 +66708,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
-  poly16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \
-})
-#else
-#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
-  poly16x8x4_t __s1 = __p1; \
-  poly16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
-  uint8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \
-})
-#else
-#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
-  uint8x16x4_t __s1 = __p1; \
-  uint8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
-  uint32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \
-})
-#else
-#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
-  uint32x4x4_t __s1 = __p1; \
-  uint32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
-  uint64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \
-})
-#else
-#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
-  uint64x2x4_t __s1 = __p1; \
-  uint64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
-  uint16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \
-})
-#else
-#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
-  uint16x8x4_t __s1 = __p1; \
-  uint16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
-  int8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \
-})
-#else
-#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
-  int8x16x4_t __s1 = __p1; \
-  int8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1q_f64_x4(__p0, __p1) __extension__ ({ \
   float64x2x4_t __s1 = __p1; \
   __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 42); \
@@ -62658,171 +66725,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
-  float32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 41); \
-})
-#else
-#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
-  float32x4x4_t __s1 = __p1; \
-  float32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
-  float16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 40); \
-})
-#else
-#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
-  float16x8x4_t __s1 = __p1; \
-  float16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
-  int32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 34); \
-})
-#else
-#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
-  int32x4x4_t __s1 = __p1; \
-  int32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
-  int64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 35); \
-})
-#else
-#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
-  int64x2x4_t __s1 = __p1; \
-  int64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 35); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
-  int16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 33); \
-})
-#else
-#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
-  int16x8x4_t __s1 = __p1; \
-  int16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
-  uint8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \
-})
-#else
-#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
-  uint8x8x4_t __s1 = __p1; \
-  uint8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
-  uint32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \
-})
-#else
-#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
-  uint32x2x4_t __s1 = __p1; \
-  uint32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u64_x4(__p0, __p1) __extension__ ({ \
-  uint64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
-})
-#else
-#define vst1_u64_x4(__p0, __p1) __extension__ ({ \
-  uint64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
-  uint16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \
-})
-#else
-#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
-  uint16x4x4_t __s1 = __p1; \
-  uint16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
-  int8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \
-})
-#else
-#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
-  int8x8x4_t __s1 = __p1; \
-  int8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst1_f64_x4(__p0, __p1) __extension__ ({ \
   float64x1x4_t __s1 = __p1; \
   __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \
@@ -62835,86 +66737,6 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
-  float32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 9); \
-})
-#else
-#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
-  float32x2x4_t __s1 = __p1; \
-  float32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
-  float16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 8); \
-})
-#else
-#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
-  float16x4x4_t __s1 = __p1; \
-  float16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
-  int32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 2); \
-})
-#else
-#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
-  int32x2x4_t __s1 = __p1; \
-  int32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 2); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s64_x4(__p0, __p1) __extension__ ({ \
-  int64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
-})
-#else
-#define vst1_s64_x4(__p0, __p1) __extension__ ({ \
-  int64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
-  int16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 1); \
-})
-#else
-#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
-  int16x4x4_t __s1 = __p1; \
-  int16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
 #define vst2_p64(__p0, __p1) __extension__ ({ \
   poly64x1x2_t __s1 = __p1; \
   __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
@@ -67149,44 +70971,60 @@
 #endif
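/*
 * Editorial note: in the renumbered hunks that follow, only the numeric
 * suffixes change (__p0_242 becomes __p0_254, and so on).  The suffix is a
 * counter the header generator appends to keep the temporaries inside each
 * statement expression unique; inserting the new fp16 intrinsics further
 * down shifted every later counter value, so these large hunks are purely
 * mechanical renames with no change in behavior.
 */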
 
 #ifdef __LITTLE_ENDIAN__
-#define vget_lane_f16(__p0_242, __p1_242) __extension__ ({ \
-  float16x4_t __s0_242 = __p0_242; \
-  float16_t __ret_242; \
-float16x4_t __reint_242 = __s0_242; \
-int16_t __reint1_242 = vget_lane_s16(*(int16x4_t *) &__reint_242, __p1_242); \
-  __ret_242 = *(float16_t *) &__reint1_242; \
-  __ret_242; \
+#define vget_lane_f16(__p0_254, __p1_254) __extension__ ({ \
+  float16x4_t __s0_254 = __p0_254; \
+  float16_t __ret_254; \
+float16x4_t __reint_254 = __s0_254; \
+int16_t __reint1_254 = vget_lane_s16(*(int16x4_t *) &__reint_254, __p1_254); \
+  __ret_254 = *(float16_t *) &__reint1_254; \
+  __ret_254; \
 })
 #else
-#define vget_lane_f16(__p0_243, __p1_243) __extension__ ({ \
-  float16x4_t __s0_243 = __p0_243; \
-  float16x4_t __rev0_243;  __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 3, 2, 1, 0); \
-  float16_t __ret_243; \
-float16x4_t __reint_243 = __rev0_243; \
-int16_t __reint1_243 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_243, __p1_243); \
-  __ret_243 = *(float16_t *) &__reint1_243; \
-  __ret_243; \
+#define vget_lane_f16(__p0_255, __p1_255) __extension__ ({ \
+  float16x4_t __s0_255 = __p0_255; \
+  float16x4_t __rev0_255;  __rev0_255 = __builtin_shufflevector(__s0_255, __s0_255, 3, 2, 1, 0); \
+  float16_t __ret_255; \
+float16x4_t __reint_255 = __rev0_255; \
+int16_t __reint1_255 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_255, __p1_255); \
+  __ret_255 = *(float16_t *) &__reint1_255; \
+  __ret_255; \
+})
+#define __noswap_vget_lane_f16(__p0_256, __p1_256) __extension__ ({ \
+  float16x4_t __s0_256 = __p0_256; \
+  float16_t __ret_256; \
+float16x4_t __reint_256 = __s0_256; \
+int16_t __reint1_256 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_256, __p1_256); \
+  __ret_256 = *(float16_t *) &__reint1_256; \
+  __ret_256; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_f16(__p0_244, __p1_244) __extension__ ({ \
-  float16x8_t __s0_244 = __p0_244; \
-  float16_t __ret_244; \
-float16x8_t __reint_244 = __s0_244; \
-int16_t __reint1_244 = vgetq_lane_s16(*(int16x8_t *) &__reint_244, __p1_244); \
-  __ret_244 = *(float16_t *) &__reint1_244; \
-  __ret_244; \
+#define vgetq_lane_f16(__p0_257, __p1_257) __extension__ ({ \
+  float16x8_t __s0_257 = __p0_257; \
+  float16_t __ret_257; \
+float16x8_t __reint_257 = __s0_257; \
+int16_t __reint1_257 = vgetq_lane_s16(*(int16x8_t *) &__reint_257, __p1_257); \
+  __ret_257 = *(float16_t *) &__reint1_257; \
+  __ret_257; \
 })
 #else
-#define vgetq_lane_f16(__p0_245, __p1_245) __extension__ ({ \
-  float16x8_t __s0_245 = __p0_245; \
-  float16x8_t __rev0_245;  __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16_t __ret_245; \
-float16x8_t __reint_245 = __rev0_245; \
-int16_t __reint1_245 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_245, __p1_245); \
-  __ret_245 = *(float16_t *) &__reint1_245; \
-  __ret_245; \
+#define vgetq_lane_f16(__p0_258, __p1_258) __extension__ ({ \
+  float16x8_t __s0_258 = __p0_258; \
+  float16x8_t __rev0_258;  __rev0_258 = __builtin_shufflevector(__s0_258, __s0_258, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret_258; \
+float16x8_t __reint_258 = __rev0_258; \
+int16_t __reint1_258 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_258, __p1_258); \
+  __ret_258 = *(float16_t *) &__reint1_258; \
+  __ret_258; \
+})
+#define __noswap_vgetq_lane_f16(__p0_259, __p1_259) __extension__ ({ \
+  float16x8_t __s0_259 = __p0_259; \
+  float16_t __ret_259; \
+float16x8_t __reint_259 = __s0_259; \
+int16_t __reint1_259 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_259, __p1_259); \
+  __ret_259 = *(float16_t *) &__reint1_259; \
+  __ret_259; \
 })
 #endif
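/*
 * Editorial note: vget_lane_f16/vgetq_lane_f16 extract a half-precision lane
 * by reinterpreting the float16 vector as int16, calling the integer lane
 * intrinsic, and reinterpreting the 16-bit result back, sidestepping the
 * need for a dedicated fp16 lane operation.  The __noswap_ variants added
 * above exist only in the big-endian branch: public intrinsics there reverse
 * vector operands on entry, so an internal helper that already holds a
 * reversed vector (e.g. vmulh_lane_f16 below) calls the __noswap_ form to
 * avoid reversing twice.
 */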
 
@@ -67835,57 +71673,97 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vset_lane_f16(__p0_246, __p1_246, __p2_246) __extension__ ({ \
-  float16_t __s0_246 = __p0_246; \
-  float16x4_t __s1_246 = __p1_246; \
-  float16x4_t __ret_246; \
-float16_t __reint_246 = __s0_246; \
-float16x4_t __reint1_246 = __s1_246; \
-int16x4_t __reint2_246 = vset_lane_s16(*(int16_t *) &__reint_246, *(int16x4_t *) &__reint1_246, __p2_246); \
-  __ret_246 = *(float16x4_t *) &__reint2_246; \
-  __ret_246; \
+#define vset_lane_f16(__p0_260, __p1_260, __p2_260) __extension__ ({ \
+  float16_t __s0_260 = __p0_260; \
+  float16x4_t __s1_260 = __p1_260; \
+  float16x4_t __ret_260; \
+float16_t __reint_260 = __s0_260; \
+float16x4_t __reint1_260 = __s1_260; \
+int16x4_t __reint2_260 = vset_lane_s16(*(int16_t *) &__reint_260, *(int16x4_t *) &__reint1_260, __p2_260); \
+  __ret_260 = *(float16x4_t *) &__reint2_260; \
+  __ret_260; \
 })
 #else
-#define vset_lane_f16(__p0_247, __p1_247, __p2_247) __extension__ ({ \
-  float16_t __s0_247 = __p0_247; \
-  float16x4_t __s1_247 = __p1_247; \
-  float16x4_t __rev1_247;  __rev1_247 = __builtin_shufflevector(__s1_247, __s1_247, 3, 2, 1, 0); \
-  float16x4_t __ret_247; \
-float16_t __reint_247 = __s0_247; \
-float16x4_t __reint1_247 = __rev1_247; \
-int16x4_t __reint2_247 = __noswap_vset_lane_s16(*(int16_t *) &__reint_247, *(int16x4_t *) &__reint1_247, __p2_247); \
-  __ret_247 = *(float16x4_t *) &__reint2_247; \
-  __ret_247 = __builtin_shufflevector(__ret_247, __ret_247, 3, 2, 1, 0); \
-  __ret_247; \
+#define vset_lane_f16(__p0_261, __p1_261, __p2_261) __extension__ ({ \
+  float16_t __s0_261 = __p0_261; \
+  float16x4_t __s1_261 = __p1_261; \
+  float16x4_t __rev1_261;  __rev1_261 = __builtin_shufflevector(__s1_261, __s1_261, 3, 2, 1, 0); \
+  float16x4_t __ret_261; \
+float16_t __reint_261 = __s0_261; \
+float16x4_t __reint1_261 = __rev1_261; \
+int16x4_t __reint2_261 = __noswap_vset_lane_s16(*(int16_t *) &__reint_261, *(int16x4_t *) &__reint1_261, __p2_261); \
+  __ret_261 = *(float16x4_t *) &__reint2_261; \
+  __ret_261 = __builtin_shufflevector(__ret_261, __ret_261, 3, 2, 1, 0); \
+  __ret_261; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_f16(__p0_248, __p1_248, __p2_248) __extension__ ({ \
-  float16_t __s0_248 = __p0_248; \
-  float16x8_t __s1_248 = __p1_248; \
-  float16x8_t __ret_248; \
-float16_t __reint_248 = __s0_248; \
-float16x8_t __reint1_248 = __s1_248; \
-int16x8_t __reint2_248 = vsetq_lane_s16(*(int16_t *) &__reint_248, *(int16x8_t *) &__reint1_248, __p2_248); \
-  __ret_248 = *(float16x8_t *) &__reint2_248; \
-  __ret_248; \
+#define vsetq_lane_f16(__p0_262, __p1_262, __p2_262) __extension__ ({ \
+  float16_t __s0_262 = __p0_262; \
+  float16x8_t __s1_262 = __p1_262; \
+  float16x8_t __ret_262; \
+float16_t __reint_262 = __s0_262; \
+float16x8_t __reint1_262 = __s1_262; \
+int16x8_t __reint2_262 = vsetq_lane_s16(*(int16_t *) &__reint_262, *(int16x8_t *) &__reint1_262, __p2_262); \
+  __ret_262 = *(float16x8_t *) &__reint2_262; \
+  __ret_262; \
 })
 #else
-#define vsetq_lane_f16(__p0_249, __p1_249, __p2_249) __extension__ ({ \
-  float16_t __s0_249 = __p0_249; \
-  float16x8_t __s1_249 = __p1_249; \
-  float16x8_t __rev1_249;  __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __ret_249; \
-float16_t __reint_249 = __s0_249; \
-float16x8_t __reint1_249 = __rev1_249; \
-int16x8_t __reint2_249 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_249, *(int16x8_t *) &__reint1_249, __p2_249); \
-  __ret_249 = *(float16x8_t *) &__reint2_249; \
-  __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_249; \
+#define vsetq_lane_f16(__p0_263, __p1_263, __p2_263) __extension__ ({ \
+  float16_t __s0_263 = __p0_263; \
+  float16x8_t __s1_263 = __p1_263; \
+  float16x8_t __rev1_263;  __rev1_263 = __builtin_shufflevector(__s1_263, __s1_263, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret_263; \
+float16_t __reint_263 = __s0_263; \
+float16x8_t __reint1_263 = __rev1_263; \
+int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(int16x8_t *) &__reint1_263, __p2_263); \
+  __ret_263 = *(float16x8_t *) &__reint2_263; \
+  __ret_263 = __builtin_shufflevector(__ret_263, __ret_263, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_263; \
 })
 #endif
 
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+#define vmulh_lane_f16(__p0_264, __p1_264, __p2_264) __extension__ ({ \
+  float16_t __s0_264 = __p0_264; \
+  float16x4_t __s1_264 = __p1_264; \
+  float16_t __ret_264; \
+  __ret_264 = __s0_264 * vget_lane_f16(__s1_264, __p2_264); \
+  __ret_264; \
+})
+#else
+#define vmulh_lane_f16(__p0_265, __p1_265, __p2_265) __extension__ ({ \
+  float16_t __s0_265 = __p0_265; \
+  float16x4_t __s1_265 = __p1_265; \
+  float16x4_t __rev1_265;  __rev1_265 = __builtin_shufflevector(__s1_265, __s1_265, 3, 2, 1, 0); \
+  float16_t __ret_265; \
+  __ret_265 = __s0_265 * __noswap_vget_lane_f16(__rev1_265, __p2_265); \
+  __ret_265; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulh_laneq_f16(__p0_266, __p1_266, __p2_266) __extension__ ({ \
+  float16_t __s0_266 = __p0_266; \
+  float16x8_t __s1_266 = __p1_266; \
+  float16_t __ret_266; \
+  __ret_266 = __s0_266 * vgetq_lane_f16(__s1_266, __p2_266); \
+  __ret_266; \
+})
+#else
+#define vmulh_laneq_f16(__p0_267, __p1_267, __p2_267) __extension__ ({ \
+  float16_t __s0_267 = __p0_267; \
+  float16x8_t __s1_267 = __p1_267; \
+  float16x8_t __rev1_267;  __rev1_267 = __builtin_shufflevector(__s1_267, __s1_267, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret_267; \
+  __ret_267 = __s0_267 * __noswap_vgetq_lane_f16(__rev1_267, __p2_267); \
+  __ret_267; \
+})
+#endif
+
+#endif
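/*
 * Editorial note: the block above is new in this update and provides scalar
 * half-precision multiply-by-lane helpers.  vmulh_lane_f16 multiplies a
 * float16_t by lane __p2 of a float16x4_t; vmulh_laneq_f16 takes the lane
 * from a 128-bit float16x8_t instead.  A minimal sketch, assuming an AArch64
 * target with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC (values hypothetical):
 *
 *   float16x4_t v = vdup_n_f16(2.0);           // {2, 2, 2, 2}
 *   float16_t r = vmulh_lane_f16(3.0, v, 1);   // 3 * v[1] == 6
 */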
 #if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)
 #ifdef __LITTLE_ENDIAN__
 __ai int32_t vqrdmlahs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
@@ -67916,86 +71794,86 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmlahs_lane_s32(__p0_250, __p1_250, __p2_250, __p3_250) __extension__ ({ \
-  int32_t __s0_250 = __p0_250; \
-  int32_t __s1_250 = __p1_250; \
-  int32x2_t __s2_250 = __p2_250; \
-  int32_t __ret_250; \
-  __ret_250 = vqadds_s32(__s0_250, vqrdmulhs_s32(__s1_250, vget_lane_s32(__s2_250, __p3_250))); \
-  __ret_250; \
+#define vqrdmlahs_lane_s32(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \
+  int32_t __s0_268 = __p0_268; \
+  int32_t __s1_268 = __p1_268; \
+  int32x2_t __s2_268 = __p2_268; \
+  int32_t __ret_268; \
+  __ret_268 = vqadds_s32(__s0_268, vqrdmulhs_s32(__s1_268, vget_lane_s32(__s2_268, __p3_268))); \
+  __ret_268; \
 })
 #else
-#define vqrdmlahs_lane_s32(__p0_251, __p1_251, __p2_251, __p3_251) __extension__ ({ \
-  int32_t __s0_251 = __p0_251; \
-  int32_t __s1_251 = __p1_251; \
-  int32x2_t __s2_251 = __p2_251; \
-  int32x2_t __rev2_251;  __rev2_251 = __builtin_shufflevector(__s2_251, __s2_251, 1, 0); \
-  int32_t __ret_251; \
-  __ret_251 = __noswap_vqadds_s32(__s0_251, __noswap_vqrdmulhs_s32(__s1_251, __noswap_vget_lane_s32(__rev2_251, __p3_251))); \
-  __ret_251; \
+#define vqrdmlahs_lane_s32(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \
+  int32_t __s0_269 = __p0_269; \
+  int32_t __s1_269 = __p1_269; \
+  int32x2_t __s2_269 = __p2_269; \
+  int32x2_t __rev2_269;  __rev2_269 = __builtin_shufflevector(__s2_269, __s2_269, 1, 0); \
+  int32_t __ret_269; \
+  __ret_269 = __noswap_vqadds_s32(__s0_269, __noswap_vqrdmulhs_s32(__s1_269, __noswap_vget_lane_s32(__rev2_269, __p3_269))); \
+  __ret_269; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmlahh_lane_s16(__p0_252, __p1_252, __p2_252, __p3_252) __extension__ ({ \
-  int16_t __s0_252 = __p0_252; \
-  int16_t __s1_252 = __p1_252; \
-  int16x4_t __s2_252 = __p2_252; \
-  int16_t __ret_252; \
-  __ret_252 = vqaddh_s16(__s0_252, vqrdmulhh_s16(__s1_252, vget_lane_s16(__s2_252, __p3_252))); \
-  __ret_252; \
+#define vqrdmlahh_lane_s16(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \
+  int16_t __s0_270 = __p0_270; \
+  int16_t __s1_270 = __p1_270; \
+  int16x4_t __s2_270 = __p2_270; \
+  int16_t __ret_270; \
+  __ret_270 = vqaddh_s16(__s0_270, vqrdmulhh_s16(__s1_270, vget_lane_s16(__s2_270, __p3_270))); \
+  __ret_270; \
 })
 #else
-#define vqrdmlahh_lane_s16(__p0_253, __p1_253, __p2_253, __p3_253) __extension__ ({ \
-  int16_t __s0_253 = __p0_253; \
-  int16_t __s1_253 = __p1_253; \
-  int16x4_t __s2_253 = __p2_253; \
-  int16x4_t __rev2_253;  __rev2_253 = __builtin_shufflevector(__s2_253, __s2_253, 3, 2, 1, 0); \
-  int16_t __ret_253; \
-  __ret_253 = __noswap_vqaddh_s16(__s0_253, __noswap_vqrdmulhh_s16(__s1_253, __noswap_vget_lane_s16(__rev2_253, __p3_253))); \
-  __ret_253; \
+#define vqrdmlahh_lane_s16(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \
+  int16_t __s0_271 = __p0_271; \
+  int16_t __s1_271 = __p1_271; \
+  int16x4_t __s2_271 = __p2_271; \
+  int16x4_t __rev2_271;  __rev2_271 = __builtin_shufflevector(__s2_271, __s2_271, 3, 2, 1, 0); \
+  int16_t __ret_271; \
+  __ret_271 = __noswap_vqaddh_s16(__s0_271, __noswap_vqrdmulhh_s16(__s1_271, __noswap_vget_lane_s16(__rev2_271, __p3_271))); \
+  __ret_271; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmlahs_laneq_s32(__p0_254, __p1_254, __p2_254, __p3_254) __extension__ ({ \
-  int32_t __s0_254 = __p0_254; \
-  int32_t __s1_254 = __p1_254; \
-  int32x4_t __s2_254 = __p2_254; \
-  int32_t __ret_254; \
-  __ret_254 = vqadds_s32(__s0_254, vqrdmulhs_s32(__s1_254, vgetq_lane_s32(__s2_254, __p3_254))); \
-  __ret_254; \
+#define vqrdmlahs_laneq_s32(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \
+  int32_t __s0_272 = __p0_272; \
+  int32_t __s1_272 = __p1_272; \
+  int32x4_t __s2_272 = __p2_272; \
+  int32_t __ret_272; \
+  __ret_272 = vqadds_s32(__s0_272, vqrdmulhs_s32(__s1_272, vgetq_lane_s32(__s2_272, __p3_272))); \
+  __ret_272; \
 })
 #else
-#define vqrdmlahs_laneq_s32(__p0_255, __p1_255, __p2_255, __p3_255) __extension__ ({ \
-  int32_t __s0_255 = __p0_255; \
-  int32_t __s1_255 = __p1_255; \
-  int32x4_t __s2_255 = __p2_255; \
-  int32x4_t __rev2_255;  __rev2_255 = __builtin_shufflevector(__s2_255, __s2_255, 3, 2, 1, 0); \
-  int32_t __ret_255; \
-  __ret_255 = __noswap_vqadds_s32(__s0_255, __noswap_vqrdmulhs_s32(__s1_255, __noswap_vgetq_lane_s32(__rev2_255, __p3_255))); \
-  __ret_255; \
+#define vqrdmlahs_laneq_s32(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \
+  int32_t __s0_273 = __p0_273; \
+  int32_t __s1_273 = __p1_273; \
+  int32x4_t __s2_273 = __p2_273; \
+  int32x4_t __rev2_273;  __rev2_273 = __builtin_shufflevector(__s2_273, __s2_273, 3, 2, 1, 0); \
+  int32_t __ret_273; \
+  __ret_273 = __noswap_vqadds_s32(__s0_273, __noswap_vqrdmulhs_s32(__s1_273, __noswap_vgetq_lane_s32(__rev2_273, __p3_273))); \
+  __ret_273; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmlahh_laneq_s16(__p0_256, __p1_256, __p2_256, __p3_256) __extension__ ({ \
-  int16_t __s0_256 = __p0_256; \
-  int16_t __s1_256 = __p1_256; \
-  int16x8_t __s2_256 = __p2_256; \
-  int16_t __ret_256; \
-  __ret_256 = vqaddh_s16(__s0_256, vqrdmulhh_s16(__s1_256, vgetq_lane_s16(__s2_256, __p3_256))); \
-  __ret_256; \
+#define vqrdmlahh_laneq_s16(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \
+  int16_t __s0_274 = __p0_274; \
+  int16_t __s1_274 = __p1_274; \
+  int16x8_t __s2_274 = __p2_274; \
+  int16_t __ret_274; \
+  __ret_274 = vqaddh_s16(__s0_274, vqrdmulhh_s16(__s1_274, vgetq_lane_s16(__s2_274, __p3_274))); \
+  __ret_274; \
 })
 #else
-#define vqrdmlahh_laneq_s16(__p0_257, __p1_257, __p2_257, __p3_257) __extension__ ({ \
-  int16_t __s0_257 = __p0_257; \
-  int16_t __s1_257 = __p1_257; \
-  int16x8_t __s2_257 = __p2_257; \
-  int16x8_t __rev2_257;  __rev2_257 = __builtin_shufflevector(__s2_257, __s2_257, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16_t __ret_257; \
-  __ret_257 = __noswap_vqaddh_s16(__s0_257, __noswap_vqrdmulhh_s16(__s1_257, __noswap_vgetq_lane_s16(__rev2_257, __p3_257))); \
-  __ret_257; \
+#define vqrdmlahh_laneq_s16(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \
+  int16_t __s0_275 = __p0_275; \
+  int16_t __s1_275 = __p1_275; \
+  int16x8_t __s2_275 = __p2_275; \
+  int16x8_t __rev2_275;  __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_275; \
+  __ret_275 = __noswap_vqaddh_s16(__s0_275, __noswap_vqrdmulhh_s16(__s1_275, __noswap_vgetq_lane_s16(__rev2_275, __p3_275))); \
+  __ret_275; \
 })
 #endif
 
@@ -68028,86 +71906,86 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmlshs_lane_s32(__p0_258, __p1_258, __p2_258, __p3_258) __extension__ ({ \
-  int32_t __s0_258 = __p0_258; \
-  int32_t __s1_258 = __p1_258; \
-  int32x2_t __s2_258 = __p2_258; \
-  int32_t __ret_258; \
-  __ret_258 = vqsubs_s32(__s0_258, vqrdmulhs_s32(__s1_258, vget_lane_s32(__s2_258, __p3_258))); \
-  __ret_258; \
+#define vqrdmlshs_lane_s32(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \
+  int32_t __s0_276 = __p0_276; \
+  int32_t __s1_276 = __p1_276; \
+  int32x2_t __s2_276 = __p2_276; \
+  int32_t __ret_276; \
+  __ret_276 = vqsubs_s32(__s0_276, vqrdmulhs_s32(__s1_276, vget_lane_s32(__s2_276, __p3_276))); \
+  __ret_276; \
 })
 #else
-#define vqrdmlshs_lane_s32(__p0_259, __p1_259, __p2_259, __p3_259) __extension__ ({ \
-  int32_t __s0_259 = __p0_259; \
-  int32_t __s1_259 = __p1_259; \
-  int32x2_t __s2_259 = __p2_259; \
-  int32x2_t __rev2_259;  __rev2_259 = __builtin_shufflevector(__s2_259, __s2_259, 1, 0); \
-  int32_t __ret_259; \
-  __ret_259 = __noswap_vqsubs_s32(__s0_259, __noswap_vqrdmulhs_s32(__s1_259, __noswap_vget_lane_s32(__rev2_259, __p3_259))); \
-  __ret_259; \
+#define vqrdmlshs_lane_s32(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \
+  int32_t __s0_277 = __p0_277; \
+  int32_t __s1_277 = __p1_277; \
+  int32x2_t __s2_277 = __p2_277; \
+  int32x2_t __rev2_277;  __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 1, 0); \
+  int32_t __ret_277; \
+  __ret_277 = __noswap_vqsubs_s32(__s0_277, __noswap_vqrdmulhs_s32(__s1_277, __noswap_vget_lane_s32(__rev2_277, __p3_277))); \
+  __ret_277; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmlshh_lane_s16(__p0_260, __p1_260, __p2_260, __p3_260) __extension__ ({ \
-  int16_t __s0_260 = __p0_260; \
-  int16_t __s1_260 = __p1_260; \
-  int16x4_t __s2_260 = __p2_260; \
-  int16_t __ret_260; \
-  __ret_260 = vqsubh_s16(__s0_260, vqrdmulhh_s16(__s1_260, vget_lane_s16(__s2_260, __p3_260))); \
-  __ret_260; \
+#define vqrdmlshh_lane_s16(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \
+  int16_t __s0_278 = __p0_278; \
+  int16_t __s1_278 = __p1_278; \
+  int16x4_t __s2_278 = __p2_278; \
+  int16_t __ret_278; \
+  __ret_278 = vqsubh_s16(__s0_278, vqrdmulhh_s16(__s1_278, vget_lane_s16(__s2_278, __p3_278))); \
+  __ret_278; \
 })
 #else
-#define vqrdmlshh_lane_s16(__p0_261, __p1_261, __p2_261, __p3_261) __extension__ ({ \
-  int16_t __s0_261 = __p0_261; \
-  int16_t __s1_261 = __p1_261; \
-  int16x4_t __s2_261 = __p2_261; \
-  int16x4_t __rev2_261;  __rev2_261 = __builtin_shufflevector(__s2_261, __s2_261, 3, 2, 1, 0); \
-  int16_t __ret_261; \
-  __ret_261 = __noswap_vqsubh_s16(__s0_261, __noswap_vqrdmulhh_s16(__s1_261, __noswap_vget_lane_s16(__rev2_261, __p3_261))); \
-  __ret_261; \
+#define vqrdmlshh_lane_s16(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \
+  int16_t __s0_279 = __p0_279; \
+  int16_t __s1_279 = __p1_279; \
+  int16x4_t __s2_279 = __p2_279; \
+  int16x4_t __rev2_279;  __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 3, 2, 1, 0); \
+  int16_t __ret_279; \
+  __ret_279 = __noswap_vqsubh_s16(__s0_279, __noswap_vqrdmulhh_s16(__s1_279, __noswap_vget_lane_s16(__rev2_279, __p3_279))); \
+  __ret_279; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmlshs_laneq_s32(__p0_262, __p1_262, __p2_262, __p3_262) __extension__ ({ \
-  int32_t __s0_262 = __p0_262; \
-  int32_t __s1_262 = __p1_262; \
-  int32x4_t __s2_262 = __p2_262; \
-  int32_t __ret_262; \
-  __ret_262 = vqsubs_s32(__s0_262, vqrdmulhs_s32(__s1_262, vgetq_lane_s32(__s2_262, __p3_262))); \
-  __ret_262; \
+#define vqrdmlshs_laneq_s32(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \
+  int32_t __s0_280 = __p0_280; \
+  int32_t __s1_280 = __p1_280; \
+  int32x4_t __s2_280 = __p2_280; \
+  int32_t __ret_280; \
+  __ret_280 = vqsubs_s32(__s0_280, vqrdmulhs_s32(__s1_280, vgetq_lane_s32(__s2_280, __p3_280))); \
+  __ret_280; \
 })
 #else
-#define vqrdmlshs_laneq_s32(__p0_263, __p1_263, __p2_263, __p3_263) __extension__ ({ \
-  int32_t __s0_263 = __p0_263; \
-  int32_t __s1_263 = __p1_263; \
-  int32x4_t __s2_263 = __p2_263; \
-  int32x4_t __rev2_263;  __rev2_263 = __builtin_shufflevector(__s2_263, __s2_263, 3, 2, 1, 0); \
-  int32_t __ret_263; \
-  __ret_263 = __noswap_vqsubs_s32(__s0_263, __noswap_vqrdmulhs_s32(__s1_263, __noswap_vgetq_lane_s32(__rev2_263, __p3_263))); \
-  __ret_263; \
+#define vqrdmlshs_laneq_s32(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \
+  int32_t __s0_281 = __p0_281; \
+  int32_t __s1_281 = __p1_281; \
+  int32x4_t __s2_281 = __p2_281; \
+  int32x4_t __rev2_281;  __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 3, 2, 1, 0); \
+  int32_t __ret_281; \
+  __ret_281 = __noswap_vqsubs_s32(__s0_281, __noswap_vqrdmulhs_s32(__s1_281, __noswap_vgetq_lane_s32(__rev2_281, __p3_281))); \
+  __ret_281; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vqrdmlshh_laneq_s16(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \
-  int16_t __s0_264 = __p0_264; \
-  int16_t __s1_264 = __p1_264; \
-  int16x8_t __s2_264 = __p2_264; \
-  int16_t __ret_264; \
-  __ret_264 = vqsubh_s16(__s0_264, vqrdmulhh_s16(__s1_264, vgetq_lane_s16(__s2_264, __p3_264))); \
-  __ret_264; \
+#define vqrdmlshh_laneq_s16(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \
+  int16_t __s0_282 = __p0_282; \
+  int16_t __s1_282 = __p1_282; \
+  int16x8_t __s2_282 = __p2_282; \
+  int16_t __ret_282; \
+  __ret_282 = vqsubh_s16(__s0_282, vqrdmulhh_s16(__s1_282, vgetq_lane_s16(__s2_282, __p3_282))); \
+  __ret_282; \
 })
 #else
-#define vqrdmlshh_laneq_s16(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \
-  int16_t __s0_265 = __p0_265; \
-  int16_t __s1_265 = __p1_265; \
-  int16x8_t __s2_265 = __p2_265; \
-  int16x8_t __rev2_265;  __rev2_265 = __builtin_shufflevector(__s2_265, __s2_265, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16_t __ret_265; \
-  __ret_265 = __noswap_vqsubh_s16(__s0_265, __noswap_vqrdmulhh_s16(__s1_265, __noswap_vgetq_lane_s16(__rev2_265, __p3_265))); \
-  __ret_265; \
+#define vqrdmlshh_laneq_s16(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \
+  int16_t __s0_283 = __p0_283; \
+  int16_t __s1_283 = __p1_283; \
+  int16x8_t __s2_283 = __p2_283; \
+  int16x8_t __rev2_283;  __rev2_283 = __builtin_shufflevector(__s2_283, __s2_283, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_283; \
+  __ret_283 = __noswap_vqsubh_s16(__s0_283, __noswap_vqrdmulhh_s16(__s1_283, __noswap_vgetq_lane_s16(__rev2_283, __p3_283))); \
+  __ret_283; \
 })
 #endif
 
@@ -68420,158 +72298,158 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_p64(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \
-  poly64x2_t __s0_266 = __p0_266; \
-  poly64x1_t __s2_266 = __p2_266; \
-  poly64x2_t __ret_266; \
-  __ret_266 = vsetq_lane_p64(vget_lane_p64(__s2_266, __p3_266), __s0_266, __p1_266); \
-  __ret_266; \
+#define vcopyq_lane_p64(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \
+  poly64x2_t __s0_284 = __p0_284; \
+  poly64x1_t __s2_284 = __p2_284; \
+  poly64x2_t __ret_284; \
+  __ret_284 = vsetq_lane_p64(vget_lane_p64(__s2_284, __p3_284), __s0_284, __p1_284); \
+  __ret_284; \
 })
 #else
-#define vcopyq_lane_p64(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \
-  poly64x2_t __s0_267 = __p0_267; \
-  poly64x1_t __s2_267 = __p2_267; \
-  poly64x2_t __rev0_267;  __rev0_267 = __builtin_shufflevector(__s0_267, __s0_267, 1, 0); \
-  poly64x2_t __ret_267; \
-  __ret_267 = __noswap_vsetq_lane_p64(__noswap_vget_lane_p64(__s2_267, __p3_267), __rev0_267, __p1_267); \
-  __ret_267 = __builtin_shufflevector(__ret_267, __ret_267, 1, 0); \
-  __ret_267; \
+#define vcopyq_lane_p64(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \
+  poly64x2_t __s0_285 = __p0_285; \
+  poly64x1_t __s2_285 = __p2_285; \
+  poly64x2_t __rev0_285;  __rev0_285 = __builtin_shufflevector(__s0_285, __s0_285, 1, 0); \
+  poly64x2_t __ret_285; \
+  __ret_285 = __noswap_vsetq_lane_p64(__noswap_vget_lane_p64(__s2_285, __p3_285), __rev0_285, __p1_285); \
+  __ret_285 = __builtin_shufflevector(__ret_285, __ret_285, 1, 0); \
+  __ret_285; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_f64(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \
-  float64x2_t __s0_268 = __p0_268; \
-  float64x1_t __s2_268 = __p2_268; \
-  float64x2_t __ret_268; \
-  __ret_268 = vsetq_lane_f64(vget_lane_f64(__s2_268, __p3_268), __s0_268, __p1_268); \
-  __ret_268; \
+#define vcopyq_lane_f64(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \
+  float64x2_t __s0_286 = __p0_286; \
+  float64x1_t __s2_286 = __p2_286; \
+  float64x2_t __ret_286; \
+  __ret_286 = vsetq_lane_f64(vget_lane_f64(__s2_286, __p3_286), __s0_286, __p1_286); \
+  __ret_286; \
 })
 #else
-#define vcopyq_lane_f64(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \
-  float64x2_t __s0_269 = __p0_269; \
-  float64x1_t __s2_269 = __p2_269; \
-  float64x2_t __rev0_269;  __rev0_269 = __builtin_shufflevector(__s0_269, __s0_269, 1, 0); \
-  float64x2_t __ret_269; \
-  __ret_269 = __noswap_vsetq_lane_f64(__noswap_vget_lane_f64(__s2_269, __p3_269), __rev0_269, __p1_269); \
-  __ret_269 = __builtin_shufflevector(__ret_269, __ret_269, 1, 0); \
-  __ret_269; \
+#define vcopyq_lane_f64(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \
+  float64x2_t __s0_287 = __p0_287; \
+  float64x1_t __s2_287 = __p2_287; \
+  float64x2_t __rev0_287;  __rev0_287 = __builtin_shufflevector(__s0_287, __s0_287, 1, 0); \
+  float64x2_t __ret_287; \
+  __ret_287 = __noswap_vsetq_lane_f64(__noswap_vget_lane_f64(__s2_287, __p3_287), __rev0_287, __p1_287); \
+  __ret_287 = __builtin_shufflevector(__ret_287, __ret_287, 1, 0); \
+  __ret_287; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_p64(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \
-  poly64x1_t __s0_270 = __p0_270; \
-  poly64x1_t __s2_270 = __p2_270; \
-  poly64x1_t __ret_270; \
-  __ret_270 = vset_lane_p64(vget_lane_p64(__s2_270, __p3_270), __s0_270, __p1_270); \
-  __ret_270; \
+#define vcopy_lane_p64(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \
+  poly64x1_t __s0_288 = __p0_288; \
+  poly64x1_t __s2_288 = __p2_288; \
+  poly64x1_t __ret_288; \
+  __ret_288 = vset_lane_p64(vget_lane_p64(__s2_288, __p3_288), __s0_288, __p1_288); \
+  __ret_288; \
 })
 #else
-#define vcopy_lane_p64(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \
-  poly64x1_t __s0_271 = __p0_271; \
-  poly64x1_t __s2_271 = __p2_271; \
-  poly64x1_t __ret_271; \
-  __ret_271 = __noswap_vset_lane_p64(__noswap_vget_lane_p64(__s2_271, __p3_271), __s0_271, __p1_271); \
-  __ret_271; \
+#define vcopy_lane_p64(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \
+  poly64x1_t __s0_289 = __p0_289; \
+  poly64x1_t __s2_289 = __p2_289; \
+  poly64x1_t __ret_289; \
+  __ret_289 = __noswap_vset_lane_p64(__noswap_vget_lane_p64(__s2_289, __p3_289), __s0_289, __p1_289); \
+  __ret_289; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_f64(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \
-  float64x1_t __s0_272 = __p0_272; \
-  float64x1_t __s2_272 = __p2_272; \
-  float64x1_t __ret_272; \
-  __ret_272 = vset_lane_f64(vget_lane_f64(__s2_272, __p3_272), __s0_272, __p1_272); \
-  __ret_272; \
+#define vcopy_lane_f64(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \
+  float64x1_t __s0_290 = __p0_290; \
+  float64x1_t __s2_290 = __p2_290; \
+  float64x1_t __ret_290; \
+  __ret_290 = vset_lane_f64(vget_lane_f64(__s2_290, __p3_290), __s0_290, __p1_290); \
+  __ret_290; \
 })
 #else
-#define vcopy_lane_f64(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \
-  float64x1_t __s0_273 = __p0_273; \
-  float64x1_t __s2_273 = __p2_273; \
-  float64x1_t __ret_273; \
-  __ret_273 = __noswap_vset_lane_f64(__noswap_vget_lane_f64(__s2_273, __p3_273), __s0_273, __p1_273); \
-  __ret_273; \
+#define vcopy_lane_f64(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \
+  float64x1_t __s0_291 = __p0_291; \
+  float64x1_t __s2_291 = __p2_291; \
+  float64x1_t __ret_291; \
+  __ret_291 = __noswap_vset_lane_f64(__noswap_vget_lane_f64(__s2_291, __p3_291), __s0_291, __p1_291); \
+  __ret_291; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_p64(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \
-  poly64x2_t __s0_274 = __p0_274; \
-  poly64x2_t __s2_274 = __p2_274; \
-  poly64x2_t __ret_274; \
-  __ret_274 = vsetq_lane_p64(vgetq_lane_p64(__s2_274, __p3_274), __s0_274, __p1_274); \
-  __ret_274; \
+#define vcopyq_laneq_p64(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \
+  poly64x2_t __s0_292 = __p0_292; \
+  poly64x2_t __s2_292 = __p2_292; \
+  poly64x2_t __ret_292; \
+  __ret_292 = vsetq_lane_p64(vgetq_lane_p64(__s2_292, __p3_292), __s0_292, __p1_292); \
+  __ret_292; \
 })
 #else
-#define vcopyq_laneq_p64(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \
-  poly64x2_t __s0_275 = __p0_275; \
-  poly64x2_t __s2_275 = __p2_275; \
-  poly64x2_t __rev0_275;  __rev0_275 = __builtin_shufflevector(__s0_275, __s0_275, 1, 0); \
-  poly64x2_t __rev2_275;  __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 1, 0); \
-  poly64x2_t __ret_275; \
-  __ret_275 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_275, __p3_275), __rev0_275, __p1_275); \
-  __ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 1, 0); \
-  __ret_275; \
+#define vcopyq_laneq_p64(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \
+  poly64x2_t __s0_293 = __p0_293; \
+  poly64x2_t __s2_293 = __p2_293; \
+  poly64x2_t __rev0_293;  __rev0_293 = __builtin_shufflevector(__s0_293, __s0_293, 1, 0); \
+  poly64x2_t __rev2_293;  __rev2_293 = __builtin_shufflevector(__s2_293, __s2_293, 1, 0); \
+  poly64x2_t __ret_293; \
+  __ret_293 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_293, __p3_293), __rev0_293, __p1_293); \
+  __ret_293 = __builtin_shufflevector(__ret_293, __ret_293, 1, 0); \
+  __ret_293; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_f64(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \
-  float64x2_t __s0_276 = __p0_276; \
-  float64x2_t __s2_276 = __p2_276; \
-  float64x2_t __ret_276; \
-  __ret_276 = vsetq_lane_f64(vgetq_lane_f64(__s2_276, __p3_276), __s0_276, __p1_276); \
-  __ret_276; \
+#define vcopyq_laneq_f64(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \
+  float64x2_t __s0_294 = __p0_294; \
+  float64x2_t __s2_294 = __p2_294; \
+  float64x2_t __ret_294; \
+  __ret_294 = vsetq_lane_f64(vgetq_lane_f64(__s2_294, __p3_294), __s0_294, __p1_294); \
+  __ret_294; \
 })
 #else
-#define vcopyq_laneq_f64(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \
-  float64x2_t __s0_277 = __p0_277; \
-  float64x2_t __s2_277 = __p2_277; \
-  float64x2_t __rev0_277;  __rev0_277 = __builtin_shufflevector(__s0_277, __s0_277, 1, 0); \
-  float64x2_t __rev2_277;  __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 1, 0); \
-  float64x2_t __ret_277; \
-  __ret_277 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_277, __p3_277), __rev0_277, __p1_277); \
-  __ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 1, 0); \
-  __ret_277; \
+#define vcopyq_laneq_f64(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \
+  float64x2_t __s0_295 = __p0_295; \
+  float64x2_t __s2_295 = __p2_295; \
+  float64x2_t __rev0_295;  __rev0_295 = __builtin_shufflevector(__s0_295, __s0_295, 1, 0); \
+  float64x2_t __rev2_295;  __rev2_295 = __builtin_shufflevector(__s2_295, __s2_295, 1, 0); \
+  float64x2_t __ret_295; \
+  __ret_295 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_295, __p3_295), __rev0_295, __p1_295); \
+  __ret_295 = __builtin_shufflevector(__ret_295, __ret_295, 1, 0); \
+  __ret_295; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_p64(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \
-  poly64x1_t __s0_278 = __p0_278; \
-  poly64x2_t __s2_278 = __p2_278; \
-  poly64x1_t __ret_278; \
-  __ret_278 = vset_lane_p64(vgetq_lane_p64(__s2_278, __p3_278), __s0_278, __p1_278); \
-  __ret_278; \
+#define vcopy_laneq_p64(__p0_296, __p1_296, __p2_296, __p3_296) __extension__ ({ \
+  poly64x1_t __s0_296 = __p0_296; \
+  poly64x2_t __s2_296 = __p2_296; \
+  poly64x1_t __ret_296; \
+  __ret_296 = vset_lane_p64(vgetq_lane_p64(__s2_296, __p3_296), __s0_296, __p1_296); \
+  __ret_296; \
 })
 #else
-#define vcopy_laneq_p64(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \
-  poly64x1_t __s0_279 = __p0_279; \
-  poly64x2_t __s2_279 = __p2_279; \
-  poly64x2_t __rev2_279;  __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 1, 0); \
-  poly64x1_t __ret_279; \
-  __ret_279 = __noswap_vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_279, __p3_279), __s0_279, __p1_279); \
-  __ret_279; \
+#define vcopy_laneq_p64(__p0_297, __p1_297, __p2_297, __p3_297) __extension__ ({ \
+  poly64x1_t __s0_297 = __p0_297; \
+  poly64x2_t __s2_297 = __p2_297; \
+  poly64x2_t __rev2_297;  __rev2_297 = __builtin_shufflevector(__s2_297, __s2_297, 1, 0); \
+  poly64x1_t __ret_297; \
+  __ret_297 = __noswap_vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_297, __p3_297), __s0_297, __p1_297); \
+  __ret_297; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_f64(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \
-  float64x1_t __s0_280 = __p0_280; \
-  float64x2_t __s2_280 = __p2_280; \
-  float64x1_t __ret_280; \
-  __ret_280 = vset_lane_f64(vgetq_lane_f64(__s2_280, __p3_280), __s0_280, __p1_280); \
-  __ret_280; \
+#define vcopy_laneq_f64(__p0_298, __p1_298, __p2_298, __p3_298) __extension__ ({ \
+  float64x1_t __s0_298 = __p0_298; \
+  float64x2_t __s2_298 = __p2_298; \
+  float64x1_t __ret_298; \
+  __ret_298 = vset_lane_f64(vgetq_lane_f64(__s2_298, __p3_298), __s0_298, __p1_298); \
+  __ret_298; \
 })
 #else
-#define vcopy_laneq_f64(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \
-  float64x1_t __s0_281 = __p0_281; \
-  float64x2_t __s2_281 = __p2_281; \
-  float64x2_t __rev2_281;  __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 1, 0); \
-  float64x1_t __ret_281; \
-  __ret_281 = __noswap_vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_281, __p3_281), __s0_281, __p1_281); \
-  __ret_281; \
+#define vcopy_laneq_f64(__p0_299, __p1_299, __p2_299, __p3_299) __extension__ ({ \
+  float64x1_t __s0_299 = __p0_299; \
+  float64x2_t __s2_299 = __p2_299; \
+  float64x2_t __rev2_299;  __rev2_299 = __builtin_shufflevector(__s2_299, __s2_299, 1, 0); \
+  float64x1_t __ret_299; \
+  __ret_299 = __noswap_vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_299, __p3_299), __s0_299, __p1_299); \
+  __ret_299; \
 })
 #endif
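
(An illustrative aside, not part of the generated header: the vcopy*_lane* macros copy one lane of a source vector into a lane of a destination via vget/vset; the big-endian variants first reverse the lanes with __builtin_shufflevector and go through the __noswap_ helpers so the element indices still refer to the little-endian layout. A minimal usage sketch, assuming an AArch64 NEON target; the function name is hypothetical:)

#include <arm_neon.h>

/* Returns { b[1], a[1] }: lane 1 of b copied into lane 0 of a. */
float64x2_t copy_high_to_low(float64x2_t a, float64x2_t b) {
  return vcopyq_laneq_f64(a, 0, b, 1);
}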
 
@@ -68928,51 +72806,51 @@
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vmulx_lane_f64(__p0_282, __p1_282, __p2_282) __extension__ ({ \
-  float64x1_t __s0_282 = __p0_282; \
-  float64x1_t __s1_282 = __p1_282; \
-  float64x1_t __ret_282; \
-  float64_t __x_282 = vget_lane_f64(__s0_282, 0); \
-  float64_t __y_282 = vget_lane_f64(__s1_282, __p2_282); \
-  float64_t __z_282 = vmulxd_f64(__x_282, __y_282); \
-  __ret_282 = vset_lane_f64(__z_282, __s0_282, __p2_282); \
-  __ret_282; \
+#define vmulx_lane_f64(__p0_300, __p1_300, __p2_300) __extension__ ({ \
+  float64x1_t __s0_300 = __p0_300; \
+  float64x1_t __s1_300 = __p1_300; \
+  float64x1_t __ret_300; \
+  float64_t __x_300 = vget_lane_f64(__s0_300, 0); \
+  float64_t __y_300 = vget_lane_f64(__s1_300, __p2_300); \
+  float64_t __z_300 = vmulxd_f64(__x_300, __y_300); \
+  __ret_300 = vset_lane_f64(__z_300, __s0_300, __p2_300); \
+  __ret_300; \
 })
 #else
-#define vmulx_lane_f64(__p0_283, __p1_283, __p2_283) __extension__ ({ \
-  float64x1_t __s0_283 = __p0_283; \
-  float64x1_t __s1_283 = __p1_283; \
-  float64x1_t __ret_283; \
-  float64_t __x_283 = __noswap_vget_lane_f64(__s0_283, 0); \
-  float64_t __y_283 = __noswap_vget_lane_f64(__s1_283, __p2_283); \
-  float64_t __z_283 = __noswap_vmulxd_f64(__x_283, __y_283); \
-  __ret_283 = __noswap_vset_lane_f64(__z_283, __s0_283, __p2_283); \
-  __ret_283; \
+#define vmulx_lane_f64(__p0_301, __p1_301, __p2_301) __extension__ ({ \
+  float64x1_t __s0_301 = __p0_301; \
+  float64x1_t __s1_301 = __p1_301; \
+  float64x1_t __ret_301; \
+  float64_t __x_301 = __noswap_vget_lane_f64(__s0_301, 0); \
+  float64_t __y_301 = __noswap_vget_lane_f64(__s1_301, __p2_301); \
+  float64_t __z_301 = __noswap_vmulxd_f64(__x_301, __y_301); \
+  __ret_301 = __noswap_vset_lane_f64(__z_301, __s0_301, __p2_301); \
+  __ret_301; \
 })
 #endif
 
 #ifdef __LITTLE_ENDIAN__
-#define vmulx_laneq_f64(__p0_284, __p1_284, __p2_284) __extension__ ({ \
-  float64x1_t __s0_284 = __p0_284; \
-  float64x2_t __s1_284 = __p1_284; \
-  float64x1_t __ret_284; \
-  float64_t __x_284 = vget_lane_f64(__s0_284, 0); \
-  float64_t __y_284 = vgetq_lane_f64(__s1_284, __p2_284); \
-  float64_t __z_284 = vmulxd_f64(__x_284, __y_284); \
-  __ret_284 = vset_lane_f64(__z_284, __s0_284, 0); \
-  __ret_284; \
+#define vmulx_laneq_f64(__p0_302, __p1_302, __p2_302) __extension__ ({ \
+  float64x1_t __s0_302 = __p0_302; \
+  float64x2_t __s1_302 = __p1_302; \
+  float64x1_t __ret_302; \
+  float64_t __x_302 = vget_lane_f64(__s0_302, 0); \
+  float64_t __y_302 = vgetq_lane_f64(__s1_302, __p2_302); \
+  float64_t __z_302 = vmulxd_f64(__x_302, __y_302); \
+  __ret_302 = vset_lane_f64(__z_302, __s0_302, 0); \
+  __ret_302; \
 })
 #else
-#define vmulx_laneq_f64(__p0_285, __p1_285, __p2_285) __extension__ ({ \
-  float64x1_t __s0_285 = __p0_285; \
-  float64x2_t __s1_285 = __p1_285; \
-  float64x2_t __rev1_285;  __rev1_285 = __builtin_shufflevector(__s1_285, __s1_285, 1, 0); \
-  float64x1_t __ret_285; \
-  float64_t __x_285 = __noswap_vget_lane_f64(__s0_285, 0); \
-  float64_t __y_285 = __noswap_vgetq_lane_f64(__rev1_285, __p2_285); \
-  float64_t __z_285 = __noswap_vmulxd_f64(__x_285, __y_285); \
-  __ret_285 = __noswap_vset_lane_f64(__z_285, __s0_285, 0); \
-  __ret_285; \
+#define vmulx_laneq_f64(__p0_303, __p1_303, __p2_303) __extension__ ({ \
+  float64x1_t __s0_303 = __p0_303; \
+  float64x2_t __s1_303 = __p1_303; \
+  float64x2_t __rev1_303;  __rev1_303 = __builtin_shufflevector(__s1_303, __s1_303, 1, 0); \
+  float64x1_t __ret_303; \
+  float64_t __x_303 = __noswap_vget_lane_f64(__s0_303, 0); \
+  float64_t __y_303 = __noswap_vgetq_lane_f64(__rev1_303, __p2_303); \
+  float64_t __z_303 = __noswap_vmulxd_f64(__x_303, __y_303); \
+  __ret_303 = __noswap_vset_lane_f64(__z_303, __s0_303, 0); \
+  __ret_303; \
 })
 #endif
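
(Another hedged sketch, assuming an AArch64 target: vmulx_laneq_f64 scalarizes through vmulxd_f64, i.e. FMULX semantics, which to my understanding yields a signed 2.0 rather than NaN for 0 * infinity. The function name below is hypothetical:)

#include <arm_neon.h>

/* a[0] * b[1] with FMULX semantics, result in lane 0. */
float64x1_t mulx_by_high_lane(float64x1_t a, float64x2_t b) {
  return vmulx_laneq_f64(a, b, 1);
}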
 
diff --git a/darwin-x86/clang-headers/armintr.h b/darwin-x86/clang-headers/armintr.h
new file mode 100644
index 0000000..933afcb
--- /dev/null
+++ b/darwin-x86/clang-headers/armintr.h
@@ -0,0 +1,45 @@
+/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only define these if we're compiling for the Windows platform (MSVC). */
+#ifndef _MSC_VER
+#include_next <armintr.h>
+#else
+
+#ifndef __ARMINTR_H
+#define __ARMINTR_H
+
+typedef enum
+{
+  _ARM_BARRIER_SY    = 0xF,
+  _ARM_BARRIER_ST    = 0xE,
+  _ARM_BARRIER_ISH   = 0xB,
+  _ARM_BARRIER_ISHST = 0xA,
+  _ARM_BARRIER_NSH   = 0x7,
+  _ARM_BARRIER_NSHST = 0x6,
+  _ARM_BARRIER_OSH   = 0x3,
+  _ARM_BARRIER_OSHST = 0x2
+} _ARMINTR_BARRIER_TYPE;
+
+#endif /* __ARMINTR_H */
+#endif /* _MSC_VER */
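
(Illustrative note, not part of the header: these enumerators are the barrier-type arguments that MSVC's ARM barrier intrinsics take; assuming the usual __dmb/__dsb declarations from MSVC's intrin.h, code such as the following compiles unchanged:)

/* Hypothetical usage, MSVC ARM target assumed. */
void publish_flag(volatile int *flag) {
  __dmb(_ARM_BARRIER_ISH);  /* data memory barrier, inner-shareable */
  *flag = 1;
}
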
diff --git a/darwin-x86/clang-headers/avx2intrin.h b/darwin-x86/clang-headers/avx2intrin.h
index 13bcbef..9688a96 100644
--- a/darwin-x86/clang-headers/avx2intrin.h
+++ b/darwin-x86/clang-headers/avx2intrin.h
@@ -29,183 +29,176 @@
 #define __AVX2INTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx2")))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128)))
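
(Hedged aside: __min_vector_width__(N) marks a function as genuinely operating on N-bit vectors so that, as far as I understand it, the backend won't narrow its vectors under prefer-vector-width tuning. Assuming a clang new enough to know the attribute, user code can opt in the same way; the function name is illustrative:)

#include <immintrin.h>

__attribute__((__target__("avx2"), __min_vector_width__(256)))
static inline __m256i add_dwords(__m256i a, __m256i b) {
  return _mm256_add_epi32(a, b);
}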
 
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
 #define _mm256_mpsadbw_epu8(X, Y, M) \
   (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                      (__v32qi)(__m256i)(Y), (int)(M))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_abs_epi8(__m256i __a)
 {
     return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_abs_epi16(__m256i __a)
 {
     return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_abs_epi32(__m256i __a)
 {
     return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_packs_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_packs_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_packus_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
 {
   return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_add_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v32qu)__a + (__v32qu)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_add_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hu)__a + (__v16hu)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_add_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8su)__a + (__v8su)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_add_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4du)__a + (__v4du)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epu8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
 }
 
-#define _mm256_alignr_epi8(a, b, n) __extension__ ({        \
+#define _mm256_alignr_epi8(a, b, n) \
   (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
-                                     (__v32qi)(__m256i)(b), (n)); })
+                                     (__v32qi)(__m256i)(b), (n))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_and_si256(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4du)__a & (__v4du)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_andnot_si256(__m256i __a, __m256i __b)
 {
   return (__m256i)(~(__v4du)__a & (__v4du)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_avg_epu8(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+  typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
+  return (__m256i)__builtin_convertvector(
+               ((__builtin_convertvector((__v32qu)__a, __v32hu) +
+                 __builtin_convertvector((__v32qu)__b, __v32hu)) + 1)
+                 >> 1, __v32qu);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_avg_epu16(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+  typedef unsigned int __v16su __attribute__((__vector_size__(64)));
+  return (__m256i)__builtin_convertvector(
+               ((__builtin_convertvector((__v16hu)__a, __v16su) +
+                 __builtin_convertvector((__v16hu)__b, __v16su)) + 1)
+                 >> 1, __v16hu);
 }
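
(Aside: the rewrite expresses the rounding average generically, widen, add, add 1, shift right by 1, narrow, so a + b + 1 cannot overflow. A scalar model of the same computation, with an illustrative function name:)

#include <stdint.h>

static inline uint8_t avg_round_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
}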
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 {
   return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
                                               (__v32qi)__M);
 }
 
-#define _mm256_blend_epi16(V1, V2, M) __extension__ ({       \
-  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1),   \
-                                   (__v16hi)(__m256i)(V2),   \
-                                   (((M) & 0x01) ? 16 : 0),  \
-                                   (((M) & 0x02) ? 17 : 1),  \
-                                   (((M) & 0x04) ? 18 : 2),  \
-                                   (((M) & 0x08) ? 19 : 3),  \
-                                   (((M) & 0x10) ? 20 : 4),  \
-                                   (((M) & 0x20) ? 21 : 5),  \
-                                   (((M) & 0x40) ? 22 : 6),  \
-                                   (((M) & 0x80) ? 23 : 7),  \
-                                   (((M) & 0x01) ? 24 : 8),  \
-                                   (((M) & 0x02) ? 25 : 9),  \
-                                   (((M) & 0x04) ? 26 : 10), \
-                                   (((M) & 0x08) ? 27 : 11), \
-                                   (((M) & 0x10) ? 28 : 12), \
-                                   (((M) & 0x20) ? 29 : 13), \
-                                   (((M) & 0x40) ? 30 : 14), \
-                                   (((M) & 0x80) ? 31 : 15)); })
+#define _mm256_blend_epi16(V1, V2, M) \
+  (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
+                                     (__v16hi)(__m256i)(V2), (int)(M))
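
(Usage sketch, illustrative function name, AVX2 target assumed: for the 16-bit blend the 8-bit immediate is reused for each 128-bit half, bit i selecting word i of V2 in that half, exactly as the removed shufflevector expansion spelled out.)

#include <immintrin.h>

static inline __m256i take_high_words(__m256i v1, __m256i v2) {
  /* 0xF0: low four words of each half from v1, high four from v2. */
  return _mm256_blend_epi16(v1, v2, 0xF0);
}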
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v32qi)__a == (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hi)__a == (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8si)__a == (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4di)__a == (__v4di)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
 {
   /* This function always performs a signed comparison, but __v32qi is a char
@@ -213,151 +206,151 @@
   return (__m256i)((__v32qs)__a > (__v32qs)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hi)__a > (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8si)__a > (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4di)__a > (__v4di)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_hadd_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_hadd_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_hadds_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_hsub_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_hsub_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_madd_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epu8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epu32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epu8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epu32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS256
 _mm256_movemask_epi8(__m256i __a)
 {
   return __builtin_ia32_pmovmskb256((__v32qi)__a);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi8_epi16(__m128i __V)
 {
   /* This function always performs a signed extension, but __v16qi is a char
@@ -365,7 +358,7 @@
   return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi8_epi32(__m128i __V)
 {
   /* This function always performs a signed extension, but __v16qi is a char
@@ -373,7 +366,7 @@
   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi8_epi64(__m128i __V)
 {
   /* This function always performs a signed extension, but __v16qi is a char
@@ -381,919 +374,795 @@
   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi16_epi32(__m128i __V)
 {
   return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi16_epi64(__m128i __V)
 {
   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi32_epi64(__m128i __V)
 {
   return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepu8_epi16(__m128i __V)
 {
   return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepu8_epi32(__m128i __V)
 {
   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepu8_epi64(__m128i __V)
 {
   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepu16_epi32(__m128i __V)
 {
   return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepu16_epi64(__m128i __V)
 {
   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtepu32_epi64(__m128i __V)
 {
   return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
 }
 
-static __inline__  __m256i __DEFAULT_FN_ATTRS
+static __inline__  __m256i __DEFAULT_FN_ATTRS256
 _mm256_mul_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mullo_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hu)__a * (__v16hu)__b);
 }
 
-static __inline__  __m256i __DEFAULT_FN_ATTRS
+static __inline__  __m256i __DEFAULT_FN_ATTRS256
 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8su)__a * (__v8su)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mul_epu32(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_or_si256(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4du)__a | (__v4du)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sad_epu8(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
 }
 
-#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
-  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
-                                   (__v8si)_mm256_undefined_si256(), \
-                                   0 + (((imm) >> 0) & 0x3), \
-                                   0 + (((imm) >> 2) & 0x3), \
-                                   0 + (((imm) >> 4) & 0x3), \
-                                   0 + (((imm) >> 6) & 0x3), \
-                                   4 + (((imm) >> 0) & 0x3), \
-                                   4 + (((imm) >> 2) & 0x3), \
-                                   4 + (((imm) >> 4) & 0x3), \
-                                   4 + (((imm) >> 6) & 0x3)); })
+#define _mm256_shuffle_epi32(a, imm) \
+  (__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))
 
-#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
-  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
-                                   (__v16hi)_mm256_undefined_si256(), \
-                                   0, 1, 2, 3, \
-                                   4  + (((imm) >> 0) & 0x3), \
-                                   4  + (((imm) >> 2) & 0x3), \
-                                   4  + (((imm) >> 4) & 0x3), \
-                                   4  + (((imm) >> 6) & 0x3), \
-                                   8, 9, 10, 11, \
-                                   12 + (((imm) >> 0) & 0x3), \
-                                   12 + (((imm) >> 2) & 0x3), \
-                                   12 + (((imm) >> 4) & 0x3), \
-                                   12 + (((imm) >> 6) & 0x3)); })
+#define _mm256_shufflehi_epi16(a, imm) \
+  (__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))
 
-#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
-  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
-                                   (__v16hi)_mm256_undefined_si256(), \
-                                   0 + (((imm) >> 0) & 0x3), \
-                                   0 + (((imm) >> 2) & 0x3), \
-                                   0 + (((imm) >> 4) & 0x3), \
-                                   0 + (((imm) >> 6) & 0x3), \
-                                   4, 5, 6, 7, \
-                                   8 + (((imm) >> 0) & 0x3), \
-                                   8 + (((imm) >> 2) & 0x3), \
-                                   8 + (((imm) >> 4) & 0x3), \
-                                   8 + (((imm) >> 6) & 0x3), \
-                                   12, 13, 14, 15); })
+#define _mm256_shufflelo_epi16(a, imm) \
+  (__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))
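
(Usage sketch, illustrative function name: the shuffle immediate packs four 2-bit source indices, applied independently within each 128-bit lane, the same behavior the removed __builtin_shufflevector expansions above encoded.)

#include <immintrin.h>

static inline __m256i splat_low_dword_per_lane(__m256i v) {
  return _mm256_shuffle_epi32(v, 0x00);  /* every selector picks element 0 */
}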
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sign_epi8(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sign_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sign_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
 }
 
-#define _mm256_slli_si256(a, imm) __extension__ ({ \
-  (__m256i)__builtin_shufflevector(                                          \
-        (__v32qi)_mm256_setzero_si256(),                                     \
-        (__v32qi)(__m256i)(a),                                               \
-        ((char)(imm)&0xF0) ?  0 : ((char)(imm)>0x0 ? 16 : 32) - (char)(imm), \
-        ((char)(imm)&0xF0) ?  1 : ((char)(imm)>0x1 ? 17 : 33) - (char)(imm), \
-        ((char)(imm)&0xF0) ?  2 : ((char)(imm)>0x2 ? 18 : 34) - (char)(imm), \
-        ((char)(imm)&0xF0) ?  3 : ((char)(imm)>0x3 ? 19 : 35) - (char)(imm), \
-        ((char)(imm)&0xF0) ?  4 : ((char)(imm)>0x4 ? 20 : 36) - (char)(imm), \
-        ((char)(imm)&0xF0) ?  5 : ((char)(imm)>0x5 ? 21 : 37) - (char)(imm), \
-        ((char)(imm)&0xF0) ?  6 : ((char)(imm)>0x6 ? 22 : 38) - (char)(imm), \
-        ((char)(imm)&0xF0) ?  7 : ((char)(imm)>0x7 ? 23 : 39) - (char)(imm), \
-        ((char)(imm)&0xF0) ?  8 : ((char)(imm)>0x8 ? 24 : 40) - (char)(imm), \
-        ((char)(imm)&0xF0) ?  9 : ((char)(imm)>0x9 ? 25 : 41) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 : 42) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 : 43) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 : 44) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 : 45) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 : 46) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 : 47) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 : 48) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 : 49) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 : 50) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 : 51) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 : 52) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 : 53) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 : 54) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 : 55) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 : 56) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 : 57) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 : 58) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 : 59) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 : 60) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 : 61) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 : 62) - (char)(imm), \
-        ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 : 63) - (char)(imm)); })
+#define _mm256_slli_si256(a, imm) \
+  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
 
-#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
+#define _mm256_bslli_epi128(a, imm) \
+  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
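
(Usage sketch, illustrative function name: the byte shift operates within each 128-bit lane with zero fill, so bytes never cross the lane boundary.)

#include <immintrin.h>

static inline __m256i shift_lanes_left_4(__m256i v) {
  return _mm256_slli_si256(v, 4);  /* per-lane 4-byte left shift */
}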
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_slli_epi16(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sll_epi16(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_slli_epi32(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sll_epi32(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_slli_epi64(__m256i __a, int __count)
 {
   return __builtin_ia32_psllqi256((__v4di)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sll_epi64(__m256i __a, __m128i __count)
 {
   return __builtin_ia32_psllq256((__v4di)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srai_epi16(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sra_epi16(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srai_epi32(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sra_epi32(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
 }
 
-#define _mm256_srli_si256(a, imm) __extension__ ({ \
-  (__m256i)__builtin_shufflevector(                                           \
-        (__v32qi)(__m256i)(a),                                               \
-        (__v32qi)_mm256_setzero_si256(),                                     \
-        ((char)(imm)&0xF0) ? 32 : (char)(imm) + ((char)(imm)>0xF ? 16 : 0),  \
-        ((char)(imm)&0xF0) ? 33 : (char)(imm) + ((char)(imm)>0xE ? 17 : 1),  \
-        ((char)(imm)&0xF0) ? 34 : (char)(imm) + ((char)(imm)>0xD ? 18 : 2),  \
-        ((char)(imm)&0xF0) ? 35 : (char)(imm) + ((char)(imm)>0xC ? 19 : 3),  \
-        ((char)(imm)&0xF0) ? 36 : (char)(imm) + ((char)(imm)>0xB ? 20 : 4),  \
-        ((char)(imm)&0xF0) ? 37 : (char)(imm) + ((char)(imm)>0xA ? 21 : 5),  \
-        ((char)(imm)&0xF0) ? 38 : (char)(imm) + ((char)(imm)>0x9 ? 22 : 6),  \
-        ((char)(imm)&0xF0) ? 39 : (char)(imm) + ((char)(imm)>0x8 ? 23 : 7),  \
-        ((char)(imm)&0xF0) ? 40 : (char)(imm) + ((char)(imm)>0x7 ? 24 : 8),  \
-        ((char)(imm)&0xF0) ? 41 : (char)(imm) + ((char)(imm)>0x6 ? 25 : 9),  \
-        ((char)(imm)&0xF0) ? 42 : (char)(imm) + ((char)(imm)>0x5 ? 26 : 10), \
-        ((char)(imm)&0xF0) ? 43 : (char)(imm) + ((char)(imm)>0x4 ? 27 : 11), \
-        ((char)(imm)&0xF0) ? 44 : (char)(imm) + ((char)(imm)>0x3 ? 28 : 12), \
-        ((char)(imm)&0xF0) ? 45 : (char)(imm) + ((char)(imm)>0x2 ? 29 : 13), \
-        ((char)(imm)&0xF0) ? 46 : (char)(imm) + ((char)(imm)>0x1 ? 30 : 14), \
-        ((char)(imm)&0xF0) ? 47 : (char)(imm) + ((char)(imm)>0x0 ? 31 : 15), \
-        ((char)(imm)&0xF0) ? 48 : (char)(imm) + ((char)(imm)>0xF ? 32 : 16), \
-        ((char)(imm)&0xF0) ? 49 : (char)(imm) + ((char)(imm)>0xE ? 33 : 17), \
-        ((char)(imm)&0xF0) ? 50 : (char)(imm) + ((char)(imm)>0xD ? 34 : 18), \
-        ((char)(imm)&0xF0) ? 51 : (char)(imm) + ((char)(imm)>0xC ? 35 : 19), \
-        ((char)(imm)&0xF0) ? 52 : (char)(imm) + ((char)(imm)>0xB ? 36 : 20), \
-        ((char)(imm)&0xF0) ? 53 : (char)(imm) + ((char)(imm)>0xA ? 37 : 21), \
-        ((char)(imm)&0xF0) ? 54 : (char)(imm) + ((char)(imm)>0x9 ? 38 : 22), \
-        ((char)(imm)&0xF0) ? 55 : (char)(imm) + ((char)(imm)>0x8 ? 39 : 23), \
-        ((char)(imm)&0xF0) ? 56 : (char)(imm) + ((char)(imm)>0x7 ? 40 : 24), \
-        ((char)(imm)&0xF0) ? 57 : (char)(imm) + ((char)(imm)>0x6 ? 41 : 25), \
-        ((char)(imm)&0xF0) ? 58 : (char)(imm) + ((char)(imm)>0x5 ? 42 : 26), \
-        ((char)(imm)&0xF0) ? 59 : (char)(imm) + ((char)(imm)>0x4 ? 43 : 27), \
-        ((char)(imm)&0xF0) ? 60 : (char)(imm) + ((char)(imm)>0x3 ? 44 : 28), \
-        ((char)(imm)&0xF0) ? 61 : (char)(imm) + ((char)(imm)>0x2 ? 45 : 29), \
-        ((char)(imm)&0xF0) ? 62 : (char)(imm) + ((char)(imm)>0x1 ? 46 : 30), \
-        ((char)(imm)&0xF0) ? 63 : (char)(imm) + ((char)(imm)>0x0 ? 47 : 31)); })
+#define _mm256_srli_si256(a, imm) \
+  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
 
-#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
+#define _mm256_bsrli_epi128(a, imm) \
+  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srli_epi16(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srl_epi16(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srli_epi32(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srl_epi32(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srli_epi64(__m256i __a, int __count)
 {
   return __builtin_ia32_psrlqi256((__v4di)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srl_epi64(__m256i __a, __m128i __count)
 {
   return __builtin_ia32_psrlq256((__v4di)__a, __count);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sub_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v32qu)__a - (__v32qu)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sub_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hu)__a - (__v16hu)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sub_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8su)__a - (__v8su)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sub_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4du)__a - (__v4du)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epu8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_xor_si256(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4du)__a ^ (__v4du)__b);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_stream_load_si256(__m256i const *__V)
 {
-  return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
+  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
+  return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
 }
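
(Hedged note: the aligned typedef gives the IR load its required 32-byte alignment, and to my knowledge the non-temporal hint only pays off on write-combining memory; elsewhere it behaves like an ordinary aligned load. Illustrative usage, pointer assumed 32-byte aligned:)

#include <immintrin.h>

__m256i load_stream(const __m256i *p) {
  return _mm256_stream_load_si256(p);
}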
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_broadcastss_ps(__m128 __X)
 {
   return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_broadcastsd_pd(__m128d __a)
 {
   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_broadcastss_ps(__m128 __X)
 {
   return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_broadcastsd_pd(__m128d __X)
 {
   return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastsi128_si256(__m128i __X)
 {
   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
 }
 
-#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
-  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1),  \
-                                   (__v4si)(__m128i)(V2),  \
-                                   (((M) & 0x01) ? 4 : 0), \
-                                   (((M) & 0x02) ? 5 : 1), \
-                                   (((M) & 0x04) ? 6 : 2), \
-                                   (((M) & 0x08) ? 7 : 3)); })
+#define _mm_blend_epi32(V1, V2, M) \
+  (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
+                                     (__v4si)(__m128i)(V2), (int)(M))
 
-#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
-  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1),   \
-                                   (__v8si)(__m256i)(V2),   \
-                                   (((M) & 0x01) ?  8 : 0), \
-                                   (((M) & 0x02) ?  9 : 1), \
-                                   (((M) & 0x04) ? 10 : 2), \
-                                   (((M) & 0x08) ? 11 : 3), \
-                                   (((M) & 0x10) ? 12 : 4), \
-                                   (((M) & 0x20) ? 13 : 5), \
-                                   (((M) & 0x40) ? 14 : 6), \
-                                   (((M) & 0x80) ? 15 : 7)); })
+#define _mm256_blend_epi32(V1, V2, M) \
+  (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
+                                     (__v8si)(__m256i)(V2), (int)(M))
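
(Usage sketch, illustrative function name: unlike the 16-bit blend, the 32-bit blend has one mask bit per element across the whole vector.)

#include <immintrin.h>

static inline __m256i take_odd_dwords(__m256i v1, __m256i v2) {
  /* 0xAA picks elements 1, 3, 5, 7 from v2, the rest from v1. */
  return _mm256_blend_epi32(v1, v2, 0xAA);
}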
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastb_epi8(__m128i __X)
 {
   return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastw_epi16(__m128i __X)
 {
   return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastd_epi32(__m128i __X)
 {
   return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastq_epi64(__m128i __X)
 {
   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastb_epi8(__m128i __X)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastw_epi16(__m128i __X)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastd_epi32(__m128i __X)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastq_epi64(__m128i __X)
 {
   return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
 }
 
-#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
-  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
-                                   (__v4df)_mm256_undefined_pd(), \
-                                   ((M) >> 0) & 0x3, \
-                                   ((M) >> 2) & 0x3, \
-                                   ((M) >> 4) & 0x3, \
-                                   ((M) >> 6) & 0x3); })
+#define _mm256_permute4x64_pd(V, M) \
+  (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 {
   return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
 }
 
-#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
-  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
-                                   (__v4di)_mm256_undefined_si256(), \
-                                   ((M) >> 0) & 0x3, \
-                                   ((M) >> 2) & 0x3, \
-                                   ((M) >> 4) & 0x3, \
-                                   ((M) >> 6) & 0x3); })
+#define _mm256_permute4x64_epi64(V, M) \
+  (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))
 
-#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
-  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })
+#define _mm256_permute2x128_si256(V1, V2, M) \
+  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))
 
-#define _mm256_extracti128_si256(V, M) __extension__ ({ \
-  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
-                                   (__v4di)_mm256_undefined_si256(), \
-                                   (((M) & 1) ? 2 : 0), \
-                                   (((M) & 1) ? 3 : 1) ); })
+#define _mm256_extracti128_si256(V, M) \
+  (__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))
 
-#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
-  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
-                                   (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
-                                   (((M) & 1) ? 0 : 4), \
-                                   (((M) & 1) ? 1 : 5), \
-                                   (((M) & 1) ? 4 : 2), \
-                                   (((M) & 1) ? 5 : 3) ); })
+#define _mm256_inserti128_si256(V1, V2, M) \
+  (__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
+                                        (__v2di)(__m128i)(V2), (int)(M))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskload_epi32(int const *__X, __m256i __M)
 {
   return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskload_epi64(long long const *__X, __m256i __M)
 {
   return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskload_epi32(int const *__X, __m128i __M)
 {
   return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskload_epi64(long long const *__X, __m128i __M)
 {
   return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
 {
   __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
 {
   __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
 {
   __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
 {
   __builtin_ia32_maskstoreq((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_sllv_epi32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_sllv_epi64(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srav_epi32(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_srav_epi32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_srlv_epi32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_srlv_epi64(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
 }
 
-#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
   (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
-                                     (__v2df)(__m128d)(mask), (s)); })
+                                     (__v2df)(__m128d)(mask), (s))
 
-#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
   (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
-                                        (__v4df)(__m256d)(mask), (s)); })
+                                        (__v4df)(__m256d)(mask), (s))
 
-#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
   (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
-                                     (__v2df)(__m128d)(mask), (s)); })
+                                     (__v2df)(__m128d)(mask), (s))
 
-#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
   (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
-                                        (__v4df)(__m256d)(mask), (s)); })
+                                        (__v4df)(__m256d)(mask), (s))
 
-#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
   (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
-                                    (__v4sf)(__m128)(mask), (s)); })
+                                    (__v4sf)(__m128)(mask), (s))
 
-#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
   (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
-                                       (__v8sf)(__m256)(mask), (s)); })
+                                       (__v8sf)(__m256)(mask), (s))
 
-#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
   (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
-                                    (__v4sf)(__m128)(mask), (s)); })
+                                    (__v4sf)(__m128)(mask), (s))
 
-#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
   (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
-                                       (__v4sf)(__m128)(mask), (s)); })
+                                       (__v4sf)(__m128)(mask), (s))
 
-#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
   (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v4si)(__m128i)(i), \
-                                    (__v4si)(__m128i)(mask), (s)); })
+                                    (__v4si)(__m128i)(mask), (s))
 
-#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
   (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                        (int const *)(m), \
                                        (__v8si)(__m256i)(i), \
-                                       (__v8si)(__m256i)(mask), (s)); })
+                                       (__v8si)(__m256i)(mask), (s))
 
-#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
   (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v2di)(__m128i)(i), \
-                                    (__v4si)(__m128i)(mask), (s)); })
+                                    (__v4si)(__m128i)(mask), (s))
 
-#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
   (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                        (int const *)(m), \
                                        (__v4di)(__m256i)(i), \
-                                       (__v4si)(__m128i)(mask), (s)); })
+                                       (__v4si)(__m128i)(mask), (s))
 
-#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
   (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v4si)(__m128i)(i), \
-                                    (__v2di)(__m128i)(mask), (s)); })
+                                    (__v2di)(__m128i)(mask), (s))
 
-#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
   (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4si)(__m128i)(i), \
-                                       (__v4di)(__m256i)(mask), (s)); })
+                                       (__v4di)(__m256i)(mask), (s))
 
-#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
   (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
-                                    (__v2di)(__m128i)(mask), (s)); })
+                                    (__v2di)(__m128i)(mask), (s))
 
-#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
   (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4di)(__m256i)(i), \
-                                       (__v4di)(__m256i)(mask), (s)); })
+                                       (__v4di)(__m256i)(mask), (s))
 
-#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
+#define _mm_i32gather_pd(m, i, s) \
   (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
-                                     (s)); })
+                                     (s))
 
-#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
+#define _mm256_i32gather_pd(m, i, s) \
   (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
-                                        (s)); })
+                                        (s))
 
-#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
+#define _mm_i64gather_pd(m, i, s) \
   (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
-                                     (s)); })
+                                     (s))
 
-#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
+#define _mm256_i64gather_pd(m, i, s) \
   (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
-                                        (s)); })
+                                        (s))
 
-#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
+#define _mm_i32gather_ps(m, i, s) \
   (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
-                                    (s)); })
+                                    (s))
 
-#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
+#define _mm256_i32gather_ps(m, i, s) \
   (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
                                                              _mm256_setzero_ps(), \
                                                              _CMP_EQ_OQ), \
-                                       (s)); })
+                                       (s))
 
-#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
+#define _mm_i64gather_ps(m, i, s) \
   (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
-                                    (s)); })
+                                    (s))
 
-#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
+#define _mm256_i64gather_ps(m, i, s) \
   (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                             _mm_setzero_ps()), \
-                                       (s)); })
+                                       (s))
 
-#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
+#define _mm_i32gather_epi32(m, i, s) \
   (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
                                     (int const *)(m), (__v4si)(__m128i)(i), \
-                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+                                    (__v4si)_mm_set1_epi32(-1), (s))
 
-#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
+#define _mm256_i32gather_epi32(m, i, s) \
   (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
                                        (int const *)(m), (__v8si)(__m256i)(i), \
-                                       (__v8si)_mm256_set1_epi32(-1), (s)); })
+                                       (__v8si)_mm256_set1_epi32(-1), (s))
 
-#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
+#define _mm_i64gather_epi32(m, i, s) \
   (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
                                     (int const *)(m), (__v2di)(__m128i)(i), \
-                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+                                    (__v4si)_mm_set1_epi32(-1), (s))
 
-#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
+#define _mm256_i64gather_epi32(m, i, s) \
   (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
                                        (int const *)(m), (__v4di)(__m256i)(i), \
-                                       (__v4si)_mm_set1_epi32(-1), (s)); })
+                                       (__v4si)_mm_set1_epi32(-1), (s))
 
-#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
+#define _mm_i32gather_epi64(m, i, s) \
   (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
                                     (long long const *)(m), \
                                     (__v4si)(__m128i)(i), \
-                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+                                    (__v2di)_mm_set1_epi64x(-1), (s))
 
-#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
+#define _mm256_i32gather_epi64(m, i, s) \
   (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
                                        (long long const *)(m), \
                                        (__v4si)(__m128i)(i), \
-                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+                                       (__v4di)_mm256_set1_epi64x(-1), (s))
 
-#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
+#define _mm_i64gather_epi64(m, i, s) \
   (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
-                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+                                    (__v2di)_mm_set1_epi64x(-1), (s))
 
-#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
+#define _mm256_i64gather_epi64(m, i, s) \
   (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
                                        (long long const *)(m), \
                                        (__v4di)(__m256i)(i), \
-                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+                                       (__v4di)_mm256_set1_epi64x(-1), (s))
 
-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128
 
 #endif /* __AVX2INTRIN_H */
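
A minimal usage sketch for the reworked AVX2 macros above (an editorial
example, not part of the diff; assumes a target built with -mavx2).
Dropping the GNU __extension__ ({ ... }) statement expressions means
macros such as _mm_i32gather_epi32 now expand to a plain cast expression
around the builtin:

/* Sketch only: gather table[1], table[3], table[5], table[7].
 * The scale argument (4 == sizeof(int)) must be a constant. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  int table[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  __m128i idx = _mm_setr_epi32(1, 3, 5, 7);
  __m128i v = _mm_i32gather_epi32(table, idx, 4);
  int out[4];
  _mm_storeu_si128((__m128i *)out, v);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 11 13 15 17 */
  return 0;
}
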
diff --git a/darwin-x86/clang-headers/avx512bitalgintrin.h b/darwin-x86/clang-headers/avx512bitalgintrin.h
new file mode 100644
index 0000000..56046f8
--- /dev/null
+++ b/darwin-x86/clang-headers/avx512bitalgintrin.h
@@ -0,0 +1,97 @@
+/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BITALGINTRIN_H
+#define __AVX512BITALGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_popcnt_epi16(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
+              (__v32hi) _mm512_popcnt_epi16(__B),
+              (__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
+{
+  return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
+              __U,
+              __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_popcnt_epi8(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
+              (__v64qi) _mm512_popcnt_epi8(__B),
+              (__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
+{
+  return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
+              __U,
+              __B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
+              (__v64qi) __B,
+              __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
+{
+  return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
+              __A,
+              __B);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
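
A minimal sketch of the new BITALG intrinsics defined in this file (not
part of the diff; requires a compiler and CPU supporting avx512bitalg):

/* Sketch only: per-lane population counts over a 512-bit vector. */
#include <immintrin.h>

__m512i bytewise_popcount(__m512i v) {
  return _mm512_popcnt_epi8(v);   /* 64 byte lanes, each result 0..8  */
}

__m512i wordwise_popcount(__m512i v) {
  return _mm512_popcnt_epi16(v);  /* 32 word lanes, each result 0..16 */
}
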
diff --git a/darwin-x86/clang-headers/avx512bwintrin.h b/darwin-x86/clang-headers/avx512bwintrin.h
index d3c5a6c..fc46323 100644
--- a/darwin-x86/clang-headers/avx512bwintrin.h
+++ b/darwin-x86/clang-headers/avx512bwintrin.h
@@ -32,317 +32,149 @@
 typedef unsigned long long __mmask64;
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
-
-static  __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_setzero_qi(void) {
-  return (__m512i)(__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0 };
-}
-
-static  __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_setzero_hi(void) {
-  return (__m512i)(__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0 };
-}
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512)))
 
 /* Integer compare */
 
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
-                                                   (__mmask64)-1);
-}
+#define _mm512_cmp_epi8_mask(a, b, p) \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), (int)(p), \
+                                         (__mmask64)-1)
 
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
-                                                   __u);
-}
+#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), (int)(p), \
+                                         (__mmask64)(m))
 
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
-                                                 (__mmask64)-1);
-}
+#define _mm512_cmp_epu8_mask(a, b, p) \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), (int)(p), \
+                                          (__mmask64)-1)
 
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
-                                                 __u);
-}
+#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), (int)(p), \
+                                          (__mmask64)(m))
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
-                                                   (__mmask32)-1);
-}
+#define _mm512_cmp_epi16_mask(a, b, p) \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), (int)(p), \
+                                         (__mmask32)-1)
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
-                                                   __u);
-}
+#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), (int)(p), \
+                                         (__mmask32)(m))
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
-                                                 (__mmask32)-1);
-}
+#define _mm512_cmp_epu16_mask(a, b, p) \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), (int)(p), \
+                                          (__mmask32)-1)
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
-                                                 __u);
-}
+#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), (int)(p), \
+                                          (__mmask32)(m))
 
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
-                                                (__mmask64)-1);
-}
+#define _mm512_cmpeq_epi8_mask(A, B) \
+    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \
+    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epi8_mask(A, B) \
+    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epi8_mask(k, A, B) \
+    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epi8_mask(A, B) \
+    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \
+    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epi8_mask(A, B) \
+    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epi8_mask(k, A, B) \
+    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epi8_mask(A, B) \
+    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epi8_mask(k, A, B) \
+    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epi8_mask(A, B) \
+    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \
+    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
-                                                __u);
-}
+#define _mm512_cmpeq_epu8_mask(A, B) \
+    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \
+    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epu8_mask(A, B) \
+    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epu8_mask(k, A, B) \
+    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epu8_mask(A, B) \
+    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \
+    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epu8_mask(A, B) \
+    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epu8_mask(k, A, B) \
+    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epu8_mask(A, B) \
+    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epu8_mask(k, A, B) \
+    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epu8_mask(A, B) \
+    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \
+    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
-                                                 (__mmask64)-1);
-}
+#define _mm512_cmpeq_epi16_mask(A, B) \
+    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \
+    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epi16_mask(A, B) \
+    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epi16_mask(k, A, B) \
+    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epi16_mask(A, B) \
+    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \
+    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epi16_mask(A, B) \
+    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epi16_mask(k, A, B) \
+    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epi16_mask(A, B) \
+    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epi16_mask(k, A, B) \
+    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epi16_mask(A, B) \
+    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \
+    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
-                                                 __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
-                                                (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
-                                                __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
-                                                 (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
-                                                 __u);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
-                                                   (__mmask64)-1);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
-                                                   __u);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
-                                                 (__mmask64)-1);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
-                                                 __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
-                                                   (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
-                                                   __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
-                                                 (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
-                                                 __u);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
-                                                (__mmask64)-1);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
-                                                 (__mmask64)-1);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
-                                                 __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
-                                                (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
-                                                 (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
-                                                 __u);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
-                                                (__mmask64)-1);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
-                                                 (__mmask64)-1);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
-                                                 __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
-                                                (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
-                                                 (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
-                                                 __u);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
-                                                (__mmask64)-1);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
-                                                 (__mmask64)-1);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
-                                                 __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
-                                                (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
-                                                 (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
-                                                 __u);
-}
+#define _mm512_cmpeq_epu16_mask(A, B) \
+    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \
+    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epu16_mask(A, B) \
+    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epu16_mask(k, A, B) \
+    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epu16_mask(A, B) \
+    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \
+    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epu16_mask(A, B) \
+    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epu16_mask(k, A, B) \
+    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epu16_mask(A, B) \
+    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epu16_mask(k, A, B) \
+    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epu16_mask(A, B) \
+    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \
+    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_add_epi8 (__m512i __A, __m512i __B) {
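
The hunk above folds the per-predicate byte/word comparisons into macros
over the generic _mm512_cmp_ep{i,u}{8,16}_mask forms, and the new
__min_vector_width__(512) in __DEFAULT_FN_ATTRS marks these functions as
genuinely requiring 512-bit vectors. A sketch of what the macro rewrite
means for callers (not part of the diff; assumes -mavx512bw):

/* Sketch only: after this change, the two functions below compile to
 * the identical builtin call. */
#include <immintrin.h>

__mmask64 equal_bytes(__m512i a, __m512i b) {
  return _mm512_cmpeq_epi8_mask(a, b);              /* macro form    */
}

__mmask64 equal_bytes_generic(__m512i a, __m512i b) {
  return _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ); /* its expansion */
}
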
@@ -350,19 +182,17 @@
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
-             (__v64qi) __B,
-             (__v64qi) __W,
-             (__mmask64) __U);
+_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_add_epi8(__A, __B),
+                                             (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
-             (__v64qi) __B,
-             (__v64qi) _mm512_setzero_qi(),
-             (__mmask64) __U);
+_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_add_epi8(__A, __B),
+                                             (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
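
The hunks above and below rewrite the masked arithmetic forms as the
unmasked operation followed by a per-lane select, instead of calling a
dedicated masked builtin. The equivalent scalar semantics, as a sketch
(not part of the diff):

/* Sketch only: reference semantics of the select-based masking pattern.
 * For _mm512_mask_add_epi8(W, U, A, B): lane i takes A[i] + B[i] when
 * mask bit i of U is set, and keeps W[i] otherwise; the maskz form
 * keeps 0 instead of W[i]. */
#include <stdint.h>

void mask_add_epi8_ref(uint8_t dst[64], const uint8_t w[64], uint64_t u,
                       const uint8_t a[64], const uint8_t b[64]) {
  for (int i = 0; i < 64; ++i)
    dst[i] = ((u >> i) & 1) ? (uint8_t)(a[i] + b[i]) : w[i];
}
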
@@ -371,19 +201,17 @@
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
-             (__v64qi) __B,
-             (__v64qi) __W,
-             (__mmask64) __U);
+_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_sub_epi8(__A, __B),
+                                             (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
-             (__v64qi) __B,
-             (__v64qi) _mm512_setzero_qi(),
-             (__mmask64) __U);
+_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_sub_epi8(__A, __B),
+                                             (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -392,19 +220,17 @@
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
-             (__v32hi) __B,
-             (__v32hi) __W,
-             (__mmask32) __U);
+_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_add_epi16(__A, __B),
+                                             (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
-             (__v32hi) __B,
-             (__v32hi) _mm512_setzero_hi(),
-             (__mmask32) __U);
+_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_add_epi16(__A, __B),
+                                             (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -413,19 +239,17 @@
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
-             (__v32hi) __B,
-             (__v32hi) __W,
-             (__mmask32) __U);
+_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_sub_epi16(__A, __B),
+                                             (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
-             (__v32hi) __B,
-             (__v32hi) _mm512_setzero_hi(),
-             (__mmask32) __U);
+_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_sub_epi16(__A, __B),
+                                             (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -434,19 +258,17 @@
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __U);
+_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_mullo_epi16(__A, __B),
+                                             (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) __U);
+_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_mullo_epi16(__A, __B),
+                                             (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -468,161 +290,133 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_abs_epi8 (__m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
-              (__v64qi) __W,
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_abs_epi8(__A),
+                                             (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_abs_epi8(__A),
+                                             (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_abs_epi16 (__m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_pabsw512((__v32hi)__A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
-              (__v32hi) __W,
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_abs_epi16(__A),
+                                             (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_abs_epi16(__A),
+                                             (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packs_epi32 (__m512i __A, __m512i __B)
+_mm512_packs_epi32(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                       (__v32hi)_mm512_packs_epi32(__A, __B),
+                                       (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
-       __m512i __B)
+_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) __W,
-              __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                       (__v32hi)_mm512_packs_epi32(__A, __B),
+                                       (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packs_epi16 (__m512i __A, __m512i __B)
+_mm512_packs_epi16(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
-       __m512i __B)
+_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) __W,
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                        (__v64qi)_mm512_packs_epi16(__A, __B),
+                                        (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                        (__v64qi)_mm512_packs_epi16(__A, __B),
+                                        (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packus_epi32 (__m512i __A, __m512i __B)
+_mm512_packus_epi32(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_packusdw512((__v16si)__A, (__v16si)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                       (__v32hi)_mm512_packus_epi32(__A, __B),
+                                       (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
-        __m512i __B)
+_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) __W,
-              __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                       (__v32hi)_mm512_packus_epi32(__A, __B),
+                                       (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packus_epi16 (__m512i __A, __m512i __B)
+_mm512_packus_epi16(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
-        __m512i __B)
+_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) __W,
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                        (__v64qi)_mm512_packus_epi16(__A, __B),
+                                        (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                        (__v64qi)_mm512_packus_epi16(__A, __B),
+                                        (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -630,7 +424,7 @@
 {
   return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
               (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
               (__mmask64) -1);
 }
 
@@ -649,7 +443,7 @@
 {
   return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
               (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
               (__mmask64) __U);
 }
 
@@ -658,7 +452,7 @@
 {
   return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
               (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
               (__mmask32) -1);
 }
 
@@ -677,7 +471,7 @@
 {
   return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
               (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
               (__mmask32) __U);
 }
 
@@ -686,7 +480,7 @@
 {
   return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
               (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
               (__mmask64) -1);
 }
 
@@ -705,7 +499,7 @@
 {
   return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
               (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
               (__mmask64) __U);
 }
 
@@ -714,7 +508,7 @@
 {
   return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
               (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
               (__mmask32) -1);
 }
 
@@ -733,316 +527,261 @@
 {
   return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
               (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
               (__mmask32) __U);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_avg_epu8 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
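+  /* Rounded average: widen each byte to 16 bits so __A + __B + 1 cannot
+     overflow, shift right by one, then narrow back to 8 bits.  E.g.
+     avg(254, 255) = (254 + 255 + 1) >> 1 = 255, which would wrap if the
+     sum were computed in 8 bits. */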
+  typedef unsigned short __v64hu __attribute__((__vector_size__(128)));
+  return (__m512i)__builtin_convertvector(
+              ((__builtin_convertvector((__v64qu) __A, __v64hu) +
+                __builtin_convertvector((__v64qu) __B, __v64hu)) + 1)
+                >> 1, __v64qu);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
           __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) __W,
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+              (__v64qi)_mm512_avg_epu8(__A, __B),
+              (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+              (__v64qi)_mm512_avg_epu8(__A, __B),
+              (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_avg_epu16 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
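+  /* Same rounded-average trick as _mm512_avg_epu8, widening each 16-bit
+     element to 32 bits so the sum plus one cannot overflow. */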
+  typedef unsigned int __v32su __attribute__((__vector_size__(128)));
+  return (__m512i)__builtin_convertvector(
+              ((__builtin_convertvector((__v32hu) __A, __v32su) +
+                __builtin_convertvector((__v32hu) __B, __v32su)) + 1)
+                >> 1, __v32hu);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
            __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+              (__v32hi)_mm512_avg_epu16(__A, __B),
+              (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+              (__v32hi)_mm512_avg_epu16(__A, __B),
+              (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epi8 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                             (__v64qi)_mm512_max_epi8(__A, __B),
+                                             (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
-          __m512i __B)
+_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) __W,
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                             (__v64qi)_mm512_max_epi8(__A, __B),
+                                             (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epi16 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                            (__v32hi)_mm512_max_epi16(__A, __B),
+                                            (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
            __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                            (__v32hi)_mm512_max_epi16(__A, __B),
+                                            (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epu8 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                             (__v64qi)_mm512_max_epu8(__A, __B),
+                                             (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
-          __m512i __B)
+_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) __W,
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                             (__v64qi)_mm512_max_epu8(__A, __B),
+                                             (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epu16 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                            (__v32hi)_mm512_max_epu16(__A, __B),
+                                            (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
-           __m512i __B)
+_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                            (__v32hi)_mm512_max_epu16(__A, __B),
+                                            (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epi8 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                             (__v64qi)_mm512_min_epi8(__A, __B),
+                                             (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
-          __m512i __B)
+_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) __W,
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                             (__v64qi)_mm512_min_epi8(__A, __B),
+                                             (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epi16 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                            (__v32hi)_mm512_min_epi16(__A, __B),
+                                            (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
-           __m512i __B)
+_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                            (__v32hi)_mm512_min_epi16(__A, __B),
+                                            (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epu8 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                             (__v64qi)_mm512_min_epu8(__A, __B),
+                                             (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
-          __m512i __B)
+_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) __W,
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                             (__v64qi)_mm512_min_epu8(__A, __B),
+                                             (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epu16 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                            (__v32hi)_mm512_min_epu16(__A, __B),
+                                            (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
-           __m512i __B)
+_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                            (__v32hi)_mm512_min_epu16(__A, __B),
+                                            (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shuffle_epi8 (__m512i __A, __m512i __B)
+_mm512_shuffle_epi8(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A, (__v64qi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
-        __m512i __B)
+_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) __W,
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                         (__v64qi)_mm512_shuffle_epi8(__A, __B),
+                                         (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
-              (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                         (__v64qi)_mm512_shuffle_epi8(__A, __B),
+                                         (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1050,7 +789,7 @@
 {
   return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
               (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
               (__mmask64) -1);
 }
 
@@ -1069,7 +808,7 @@
 {
   return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
               (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
               (__mmask64) __U);
 }
 
@@ -1078,7 +817,7 @@
 {
   return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
               (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
               (__mmask32) -1);
 }
 
@@ -1097,7 +836,7 @@
 {
   return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
               (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
               (__mmask32) __U);
 }
 
@@ -1106,7 +845,7 @@
 {
   return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
               (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
               (__mmask64) -1);
 }
 
@@ -1125,7 +864,7 @@
 {
   return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
               (__v64qi) __B,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
               (__mmask64) __U);
 }
 
@@ -1134,7 +873,7 @@
 {
   return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
               (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
               (__mmask32) -1);
 }
 
@@ -1153,182 +892,148 @@
 {
   return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
               (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
               (__mmask32) __U);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I,
-         __mmask32 __U, __m512i __B)
+_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
-              (__v32hi) __I /* idx */ ,
-              (__v32hi) __B,
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
+                                                 (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
+_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I,
+                               __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
-              (__v32hi) __A,
-              (__v32hi) __B,
-              (__mmask32) -1);
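+  /* Merge-masking: elements whose mask bit is clear keep the corresponding
+     value from the first source operand __A. */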
+  return (__m512i)__builtin_ia32_selectw_512(__U,
+                              (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
+                              (__v32hi)__A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U,
-        __m512i __I, __m512i __B)
+_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U,
+                                __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
-              (__v32hi) __A,
-              (__v32hi) __B,
-              (__mmask32) __U);
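+  /* The mask2 variant instead merges against the index vector: elements
+     whose mask bit is clear keep the corresponding value from __I. */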
+  return (__m512i)__builtin_ia32_selectw_512(__U,
+                              (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
+                              (__v32hi)__I);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A,
-         __m512i __I, __m512i __B)
+_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
+                                __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I
-              /* idx */ ,
-              (__v32hi) __A,
-              (__v32hi) __B,
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512(__U,
+                              (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
+                              (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mulhrs_epi16 (__m512i __A, __m512i __B)
+_mm512_mulhrs_epi16(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
-                (__v32hi) __B,
-                (__v32hi) _mm512_setzero_hi(),
-                (__mmask32) -1);
+  return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-        __m512i __B)
+_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
-                (__v32hi) __B,
-                (__v32hi) __W,
-                (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_mulhrs_epi16(__A, __B),
+                                         (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
-                (__v32hi) __B,
-                (__v32hi) _mm512_setzero_hi(),
-                (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_mulhrs_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mulhi_epi16 (__m512i __A, __m512i __B)
+_mm512_mulhi_epi16(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A,
        __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_mulhi_epi16(__A, __B),
+                                          (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_mulhi_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mulhi_epu16 (__m512i __A, __m512i __B)
+_mm512_mulhi_epu16(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
-               (__v32hi) __B,
-               (__v32hi) _mm512_setzero_hi(),
-               (__mmask32) -1);
+  return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
-       __m512i __B)
+_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
-               (__v32hi) __B,
-               (__v32hi) __W,
-               (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_mulhi_epu16(__A, __B),
+                                          (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
-               (__v32hi) __B,
-               (__v32hi) _mm512_setzero_hi(),
-               (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_mulhi_epu16(__A, __B),
+                                          (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) {
-  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
-                 (__v64qi) __Y,
-                 (__v32hi) _mm512_setzero_hi(),
-                 (__mmask32) -1);
+_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
+  return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X,
-         __m512i __Y) {
-  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
-                 (__v64qi) __Y,
-                 (__v32hi) __W,
-                 (__mmask32) __U);
+_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
+                          __m512i __Y) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                        (__v32hi)_mm512_maddubs_epi16(__X, __Y),
+                                        (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) {
-  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
-                 (__v64qi) __Y,
-                 (__v32hi) _mm512_setzero_hi(),
-                 (__mmask32) __U);
+_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                        (__v32hi)_mm512_maddubs_epi16(__X, __Y),
+                                        (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_madd_epi16 (__m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
-               (__v32hi) __B,
-               (__v16si) _mm512_setzero_si512(),
-               (__mmask16) -1);
+_mm512_madd_epi16(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A,
-      __m512i __B) {
-  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
-               (__v32hi) __B,
-               (__v16si) __W,
-               (__mmask16) __U);
+_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_madd_epi16(__A, __B),
+                                           (__v16si)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
-               (__v32hi) __B,
-               (__v16si) _mm512_setzero_si512(),
-               (__mmask16) __U);
+_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_madd_epi16(__A, __B),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -1376,7 +1081,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm512_cvtepi16_epi8 (__m512i __A) {
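+  /* With an all-ones mask every element is written, so the pass-through
+     argument may be left undefined instead of zeroed. */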
   return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
-              (__v32qi) _mm256_setzero_si256(),
+              (__v32qi) _mm256_undefined_si256(),
               (__mmask32) -1);
 }
 
@@ -1444,7 +1149,7 @@
 _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                         (__v64qi)_mm512_unpackhi_epi8(__A, __B),
-                                        (__v64qi)_mm512_setzero_qi());
+                                        (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1471,7 +1176,7 @@
 _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                        (__v32hi)_mm512_unpackhi_epi16(__A, __B),
-                                       (__v32hi)_mm512_setzero_hi());
+                                       (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1506,7 +1211,7 @@
 _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                         (__v64qi)_mm512_unpacklo_epi8(__A, __B),
-                                        (__v64qi)_mm512_setzero_qi());
+                                        (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1533,539 +1238,291 @@
 _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                        (__v32hi)_mm512_unpacklo_epi16(__A, __B),
-                                       (__v32hi)_mm512_setzero_hi());
+                                       (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi16 (__m256i __A)
+_mm512_cvtepi8_epi16(__m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
-                (__v32hi)
-                _mm512_setzero_hi (),
-                (__mmask32) -1);
+  /* This function always performs a signed extension, but __v32qi is a vector
+     of char, which may be signed or unsigned depending on the target, so use
+     the explicitly signed __v32qs. */
+  return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
-                (__v32hi) __W,
-                (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepi8_epi16(__A),
+                                             (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A)
+_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
-                (__v32hi)
-                _mm512_setzero_hi(),
-                (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepi8_epi16(__A),
+                                             (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi16 (__m256i __A)
+_mm512_cvtepu8_epi16(__m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
-                (__v32hi)
-                _mm512_setzero_hi (),
-                (__mmask32) -1);
+  return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
-                (__v32hi) __W,
-                (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepu8_epi16(__A),
+                                             (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)
+_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
-                (__v32hi)
-                _mm512_setzero_hi(),
-                (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepu8_epi16(__A),
+                                             (__v32hi)_mm512_setzero_si512());
 }
 
 
-#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
-  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
-                                         (__v64qi)(__m512i)(b), (int)(p), \
-                                         (__mmask64)-1); })
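+/* imm packs four 2-bit source indices, one per destination word; the shuffle
+   applies to the high four words of each 128-bit lane while the low four
+   words pass through unchanged. */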
+#define _mm512_shufflehi_epi16(A, imm) \
+  (__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm))
 
-#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
-  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
-                                         (__v64qi)(__m512i)(b), (int)(p), \
-                                         (__mmask64)(m)); })
-
-#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
-  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
-                                          (__v64qi)(__m512i)(b), (int)(p), \
-                                          (__mmask64)-1); })
-
-#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
-  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
-                                          (__v64qi)(__m512i)(b), (int)(p), \
-                                          (__mmask64)(m)); })
-
-#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
-  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
-                                         (__v32hi)(__m512i)(b), (int)(p), \
-                                         (__mmask32)-1); })
-
-#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
-  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
-                                         (__v32hi)(__m512i)(b), (int)(p), \
-                                         (__mmask32)(m)); })
-
-#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
-  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
-                                          (__v32hi)(__m512i)(b), (int)(p), \
-                                          (__mmask32)-1); })
-
-#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
-  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
-                                          (__v32hi)(__m512i)(b), (int)(p), \
-                                          (__mmask32)(m)); })
-
-#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
-  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
-                                   (__v32hi)_mm512_undefined_epi32(), \
-                                   0, 1, 2, 3, \
-                                   4  + (((imm) >> 0) & 0x3), \
-                                   4  + (((imm) >> 2) & 0x3), \
-                                   4  + (((imm) >> 4) & 0x3), \
-                                   4  + (((imm) >> 6) & 0x3), \
-                                   8, 9, 10, 11, \
-                                   12 + (((imm) >> 0) & 0x3), \
-                                   12 + (((imm) >> 2) & 0x3), \
-                                   12 + (((imm) >> 4) & 0x3), \
-                                   12 + (((imm) >> 6) & 0x3), \
-                                   16, 17, 18, 19, \
-                                   20 + (((imm) >> 0) & 0x3), \
-                                   20 + (((imm) >> 2) & 0x3), \
-                                   20 + (((imm) >> 4) & 0x3), \
-                                   20 + (((imm) >> 6) & 0x3), \
-                                   24, 25, 26, 27, \
-                                   28 + (((imm) >> 0) & 0x3), \
-                                   28 + (((imm) >> 2) & 0x3), \
-                                   28 + (((imm) >> 4) & 0x3), \
-                                   28 + (((imm) >> 6) & 0x3)); })
-
-#define _mm512_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \
   (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_shufflehi_epi16((A), \
                                                                       (imm)), \
-                                      (__v32hi)(__m512i)(W)); })
+                                      (__v32hi)(__m512i)(W))
 
-#define _mm512_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_shufflehi_epi16(U, A, imm) \
   (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_shufflehi_epi16((A), \
                                                                       (imm)), \
-                                      (__v32hi)_mm512_setzero_hi()); })
+                                      (__v32hi)_mm512_setzero_si512())
 
-#define _mm512_shufflelo_epi16(A, imm) __extension__ ({ \
-  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
-                                   (__v32hi)_mm512_undefined_epi32(), \
-                                   0 + (((imm) >> 0) & 0x3), \
-                                   0 + (((imm) >> 2) & 0x3), \
-                                   0 + (((imm) >> 4) & 0x3), \
-                                   0 + (((imm) >> 6) & 0x3), \
-                                   4, 5, 6, 7, \
-                                   8 + (((imm) >> 0) & 0x3), \
-                                   8 + (((imm) >> 2) & 0x3), \
-                                   8 + (((imm) >> 4) & 0x3), \
-                                   8 + (((imm) >> 6) & 0x3), \
-                                   12, 13, 14, 15, \
-                                   16 + (((imm) >> 0) & 0x3), \
-                                   16 + (((imm) >> 2) & 0x3), \
-                                   16 + (((imm) >> 4) & 0x3), \
-                                   16 + (((imm) >> 6) & 0x3), \
-                                   20, 21, 22, 23, \
-                                   24 + (((imm) >> 0) & 0x3), \
-                                   24 + (((imm) >> 2) & 0x3), \
-                                   24 + (((imm) >> 4) & 0x3), \
-                                   24 + (((imm) >> 6) & 0x3), \
-                                   28, 29, 30, 31); })
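+/* Same imm encoding as _mm512_shufflehi_epi16, but the shuffle selects among
+   the low four words of each 128-bit lane. */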
+#define _mm512_shufflelo_epi16(A, imm) \
+  (__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm))
 
 
-#define _mm512_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \
   (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_shufflelo_epi16((A), \
                                                                       (imm)), \
-                                      (__v32hi)(__m512i)(W)); })
+                                      (__v32hi)(__m512i)(W))
 
 
-#define _mm512_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_shufflelo_epi16(U, A, imm) \
   (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_shufflelo_epi16((A), \
                                                                       (imm)), \
-                                      (__v32hi)_mm512_setzero_hi()); })
+                                      (__v32hi)_mm512_setzero_si512())
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi16 (__m512i __A, __m512i __B)
+_mm512_sllv_epi16(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi)
-              _mm512_setzero_hi (),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-      __m512i __B)
+_mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_sllv_epi16(__A, __B),
+                                           (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi)
-              _mm512_setzero_hi (),
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_sllv_epi16(__A, __B),
+                                           (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi16 (__m512i __A, __m128i __B)
+_mm512_sll_epi16(__m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
-             (__v8hi) __B,
-             (__v32hi)
-             _mm512_setzero_hi (),
-             (__mmask32) -1);
+  return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-           __m128i __B)
+_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
-             (__v8hi) __B,
-             (__v32hi) __W,
-             (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sll_epi16(__A, __B),
+                                          (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
-             (__v8hi) __B,
-             (__v32hi)
-             _mm512_setzero_hi (),
-             (__mmask32) __U);
-}
-
-#define _mm512_slli_epi16(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
-                                         (__v32hi)_mm512_setzero_hi(), \
-                                         (__mmask32)-1); })
-
-#define _mm512_mask_slli_epi16(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
-                                         (__v32hi)(__m512i)(W), \
-                                         (__mmask32)(U)); })
-
-#define _mm512_maskz_slli_epi16(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
-                                         (__v32hi)_mm512_setzero_hi(), \
-                                         (__mmask32)(U)); })
-
-#define _mm512_bslli_epi128(a, imm) __extension__ ({ \
-  (__m512i)__builtin_shufflevector(                                          \
-       (__v64qi)_mm512_setzero_si512(),                                      \
-       (__v64qi)(__m512i)(a),                                                \
-       ((char)(imm)&0xF0) ?  0 : ((char)(imm)>0x0 ? 16 :  64) - (char)(imm), \
-       ((char)(imm)&0xF0) ?  1 : ((char)(imm)>0x1 ? 17 :  65) - (char)(imm), \
-       ((char)(imm)&0xF0) ?  2 : ((char)(imm)>0x2 ? 18 :  66) - (char)(imm), \
-       ((char)(imm)&0xF0) ?  3 : ((char)(imm)>0x3 ? 19 :  67) - (char)(imm), \
-       ((char)(imm)&0xF0) ?  4 : ((char)(imm)>0x4 ? 20 :  68) - (char)(imm), \
-       ((char)(imm)&0xF0) ?  5 : ((char)(imm)>0x5 ? 21 :  69) - (char)(imm), \
-       ((char)(imm)&0xF0) ?  6 : ((char)(imm)>0x6 ? 22 :  70) - (char)(imm), \
-       ((char)(imm)&0xF0) ?  7 : ((char)(imm)>0x7 ? 23 :  71) - (char)(imm), \
-       ((char)(imm)&0xF0) ?  8 : ((char)(imm)>0x8 ? 24 :  72) - (char)(imm), \
-       ((char)(imm)&0xF0) ?  9 : ((char)(imm)>0x9 ? 25 :  73) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 :  74) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 :  75) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 :  76) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 :  77) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 :  78) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 :  79) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 :  80) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 :  81) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 :  82) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 :  83) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 :  84) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 :  85) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 :  86) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 :  87) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 :  88) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 :  89) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 :  90) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 :  91) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 :  92) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 :  93) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 :  94) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 :  95) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 32 : ((char)(imm)>0x0 ? 48 :  96) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 33 : ((char)(imm)>0x1 ? 49 :  97) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 34 : ((char)(imm)>0x2 ? 50 :  98) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 35 : ((char)(imm)>0x3 ? 51 :  99) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 36 : ((char)(imm)>0x4 ? 52 : 100) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 37 : ((char)(imm)>0x5 ? 53 : 101) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 38 : ((char)(imm)>0x6 ? 54 : 102) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 39 : ((char)(imm)>0x7 ? 55 : 103) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 40 : ((char)(imm)>0x8 ? 56 : 104) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 41 : ((char)(imm)>0x9 ? 57 : 105) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 42 : ((char)(imm)>0xA ? 58 : 106) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 43 : ((char)(imm)>0xB ? 59 : 107) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 44 : ((char)(imm)>0xC ? 60 : 108) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 45 : ((char)(imm)>0xD ? 61 : 109) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 46 : ((char)(imm)>0xE ? 62 : 110) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 47 : ((char)(imm)>0xF ? 63 : 111) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 48 : ((char)(imm)>0x0 ? 64 : 112) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 49 : ((char)(imm)>0x1 ? 65 : 113) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 50 : ((char)(imm)>0x2 ? 66 : 114) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 51 : ((char)(imm)>0x3 ? 67 : 115) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 52 : ((char)(imm)>0x4 ? 68 : 116) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 53 : ((char)(imm)>0x5 ? 69 : 117) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 54 : ((char)(imm)>0x6 ? 70 : 118) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 55 : ((char)(imm)>0x7 ? 71 : 119) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 56 : ((char)(imm)>0x8 ? 72 : 120) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 57 : ((char)(imm)>0x9 ? 73 : 121) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 58 : ((char)(imm)>0xA ? 74 : 122) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 59 : ((char)(imm)>0xB ? 75 : 123) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 60 : ((char)(imm)>0xC ? 76 : 124) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 61 : ((char)(imm)>0xD ? 77 : 125) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 62 : ((char)(imm)>0xE ? 78 : 126) - (char)(imm), \
-       ((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 79 : 127) - (char)(imm)); })
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srlv_epi16 (__m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi)
-              _mm512_setzero_hi (),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sll_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-      __m512i __B)
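+/* Previously a macro; as a function the shift count no longer has to be an
+   immediate constant.  Counts above 15 yield zero. */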
+_mm512_slli_epi16(__m512i __A, int __B)
 {
-  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
 {
-  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi)
-              _mm512_setzero_hi (),
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_slli_epi16(__A, __B),
+                                         (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi16 (__m512i __A, __m512i __B)
+_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
 {
-  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi)
-              _mm512_setzero_hi (),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_slli_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_si512());
+}
+
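+/* Shifts each 128-bit lane of (a) left by imm bytes, filling with zeros;
+   counts above 15 clear the whole lane. */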
+#define _mm512_bslli_epi128(a, imm) \
+  (__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi16(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-      __m512i __B)
+_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi) __W,
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srlv_epi16(__A, __B),
+                                           (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v32hi)
-              _mm512_setzero_hi (),
-              (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srlv_epi16(__A, __B),
+                                           (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi16 (__m512i __A, __m128i __B)
+_mm512_srav_epi16(__m512i __A, __m512i __B)
 {
- return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
-             (__v8hi) __B,
-             (__v32hi)
-             _mm512_setzero_hi (),
-             (__mmask32) -1);
+  return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-           __m128i __B)
+_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
-             (__v8hi) __B,
-             (__v32hi) __W,
-            (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srav_epi16(__A, __B),
+                                           (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
-             (__v8hi) __B,
-             (__v32hi)
-             _mm512_setzero_hi (),
-            (__mmask32) __U);
-}
-
-#define _mm512_srai_epi16(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
-                                         (__v32hi)_mm512_setzero_hi(), \
-                                         (__mmask32)-1); })
-
-#define _mm512_mask_srai_epi16(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
-                                         (__v32hi)(__m512i)(W), \
-                                         (__mmask32)(U)); })
-
-#define _mm512_maskz_srai_epi16(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
-                                         (__v32hi)_mm512_setzero_hi(), \
-                                         (__mmask32)(U)); })
-
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi16 (__m512i __A, __m128i __B)
-{
-  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
-             (__v8hi) __B,
-             (__v32hi)
-             _mm512_setzero_hi (),
-             (__mmask32) -1);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srav_epi16(__A, __B),
+                                           (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-           __m128i __B)
+_mm512_sra_epi16(__m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
-             (__v8hi) __B,
-             (__v32hi) __W,
-             (__mmask32) __U);
+  return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
-             (__v8hi) __B,
-             (__v32hi)
-             _mm512_setzero_hi (),
-             (__mmask32) __U);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sra_epi16(__A, __B),
+                                          (__v32hi)__W);
 }
 
-#define _mm512_srli_epi16(A, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
-                                         (__v32hi)_mm512_setzero_hi(), \
-                                         (__mmask32)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sra_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_si512());
+}
 
-#define _mm512_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
-                                         (__v32hi)(__m512i)(W), \
-                                         (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi16(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
+}
 
-#define _mm512_maskz_srli_epi16(U, A, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
-                                         (__v32hi)_mm512_setzero_hi(), \
-                                         (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srai_epi16(__A, __B),
+                                         (__v32hi)__W);
+}
 
-#define _mm512_bsrli_epi128(a, imm) __extension__ ({ \
-  (__m512i)__builtin_shufflevector(                     \
-      (__v64qi)(__m512i)(a),                      \
-      (__v64qi)_mm512_setzero_si512(),            \
-      ((char)(imm)&0xF0) ?  64 : (char)(imm) + ((char)(imm)>0xF ?  48 : 0),  \
-      ((char)(imm)&0xF0) ?  65 : (char)(imm) + ((char)(imm)>0xE ?  49 : 1),  \
-      ((char)(imm)&0xF0) ?  66 : (char)(imm) + ((char)(imm)>0xD ?  50 : 2),  \
-      ((char)(imm)&0xF0) ?  67 : (char)(imm) + ((char)(imm)>0xC ?  51 : 3),  \
-      ((char)(imm)&0xF0) ?  68 : (char)(imm) + ((char)(imm)>0xB ?  52 : 4),  \
-      ((char)(imm)&0xF0) ?  69 : (char)(imm) + ((char)(imm)>0xA ?  53 : 5),  \
-      ((char)(imm)&0xF0) ?  70 : (char)(imm) + ((char)(imm)>0x9 ?  54 : 6),  \
-      ((char)(imm)&0xF0) ?  71 : (char)(imm) + ((char)(imm)>0x8 ?  55 : 7),  \
-      ((char)(imm)&0xF0) ?  72 : (char)(imm) + ((char)(imm)>0x7 ?  56 : 8),  \
-      ((char)(imm)&0xF0) ?  73 : (char)(imm) + ((char)(imm)>0x6 ?  57 : 9),  \
-      ((char)(imm)&0xF0) ?  74 : (char)(imm) + ((char)(imm)>0x5 ?  58 : 10), \
-      ((char)(imm)&0xF0) ?  75 : (char)(imm) + ((char)(imm)>0x4 ?  59 : 11), \
-      ((char)(imm)&0xF0) ?  76 : (char)(imm) + ((char)(imm)>0x3 ?  60 : 12), \
-      ((char)(imm)&0xF0) ?  77 : (char)(imm) + ((char)(imm)>0x2 ?  61 : 13), \
-      ((char)(imm)&0xF0) ?  78 : (char)(imm) + ((char)(imm)>0x1 ?  62 : 14), \
-      ((char)(imm)&0xF0) ?  79 : (char)(imm) + ((char)(imm)>0x0 ?  63 : 15), \
-      ((char)(imm)&0xF0) ?  80 : (char)(imm) + ((char)(imm)>0xF ?  64 : 16), \
-      ((char)(imm)&0xF0) ?  81 : (char)(imm) + ((char)(imm)>0xE ?  65 : 17), \
-      ((char)(imm)&0xF0) ?  82 : (char)(imm) + ((char)(imm)>0xD ?  66 : 18), \
-      ((char)(imm)&0xF0) ?  83 : (char)(imm) + ((char)(imm)>0xC ?  67 : 19), \
-      ((char)(imm)&0xF0) ?  84 : (char)(imm) + ((char)(imm)>0xB ?  68 : 20), \
-      ((char)(imm)&0xF0) ?  85 : (char)(imm) + ((char)(imm)>0xA ?  69 : 21), \
-      ((char)(imm)&0xF0) ?  86 : (char)(imm) + ((char)(imm)>0x9 ?  70 : 22), \
-      ((char)(imm)&0xF0) ?  87 : (char)(imm) + ((char)(imm)>0x8 ?  71 : 23), \
-      ((char)(imm)&0xF0) ?  88 : (char)(imm) + ((char)(imm)>0x7 ?  72 : 24), \
-      ((char)(imm)&0xF0) ?  89 : (char)(imm) + ((char)(imm)>0x6 ?  73 : 25), \
-      ((char)(imm)&0xF0) ?  90 : (char)(imm) + ((char)(imm)>0x5 ?  74 : 26), \
-      ((char)(imm)&0xF0) ?  91 : (char)(imm) + ((char)(imm)>0x4 ?  75 : 27), \
-      ((char)(imm)&0xF0) ?  92 : (char)(imm) + ((char)(imm)>0x3 ?  76 : 28), \
-      ((char)(imm)&0xF0) ?  93 : (char)(imm) + ((char)(imm)>0x2 ?  77 : 29), \
-      ((char)(imm)&0xF0) ?  94 : (char)(imm) + ((char)(imm)>0x1 ?  78 : 30), \
-      ((char)(imm)&0xF0) ?  95 : (char)(imm) + ((char)(imm)>0x0 ?  79 : 31), \
-      ((char)(imm)&0xF0) ?  96 : (char)(imm) + ((char)(imm)>0xF ?  80 : 32), \
-      ((char)(imm)&0xF0) ?  97 : (char)(imm) + ((char)(imm)>0xE ?  81 : 33), \
-      ((char)(imm)&0xF0) ?  98 : (char)(imm) + ((char)(imm)>0xD ?  82 : 34), \
-      ((char)(imm)&0xF0) ?  99 : (char)(imm) + ((char)(imm)>0xC ?  83 : 35), \
-      ((char)(imm)&0xF0) ? 100 : (char)(imm) + ((char)(imm)>0xB ?  84 : 36), \
-      ((char)(imm)&0xF0) ? 101 : (char)(imm) + ((char)(imm)>0xA ?  85 : 37), \
-      ((char)(imm)&0xF0) ? 102 : (char)(imm) + ((char)(imm)>0x9 ?  86 : 38), \
-      ((char)(imm)&0xF0) ? 103 : (char)(imm) + ((char)(imm)>0x8 ?  87 : 39), \
-      ((char)(imm)&0xF0) ? 104 : (char)(imm) + ((char)(imm)>0x7 ?  88 : 40), \
-      ((char)(imm)&0xF0) ? 105 : (char)(imm) + ((char)(imm)>0x6 ?  89 : 41), \
-      ((char)(imm)&0xF0) ? 106 : (char)(imm) + ((char)(imm)>0x5 ?  90 : 42), \
-      ((char)(imm)&0xF0) ? 107 : (char)(imm) + ((char)(imm)>0x4 ?  91 : 43), \
-      ((char)(imm)&0xF0) ? 108 : (char)(imm) + ((char)(imm)>0x3 ?  92 : 44), \
-      ((char)(imm)&0xF0) ? 109 : (char)(imm) + ((char)(imm)>0x2 ?  93 : 45), \
-      ((char)(imm)&0xF0) ? 110 : (char)(imm) + ((char)(imm)>0x1 ?  94 : 46), \
-      ((char)(imm)&0xF0) ? 111 : (char)(imm) + ((char)(imm)>0x0 ?  95 : 47), \
-      ((char)(imm)&0xF0) ? 112 : (char)(imm) + ((char)(imm)>0xF ?  96 : 48), \
-      ((char)(imm)&0xF0) ? 113 : (char)(imm) + ((char)(imm)>0xE ?  97 : 49), \
-      ((char)(imm)&0xF0) ? 114 : (char)(imm) + ((char)(imm)>0xD ?  98 : 50), \
-      ((char)(imm)&0xF0) ? 115 : (char)(imm) + ((char)(imm)>0xC ?  99 : 51), \
-      ((char)(imm)&0xF0) ? 116 : (char)(imm) + ((char)(imm)>0xB ? 100 : 52), \
-      ((char)(imm)&0xF0) ? 117 : (char)(imm) + ((char)(imm)>0xA ? 101 : 53), \
-      ((char)(imm)&0xF0) ? 118 : (char)(imm) + ((char)(imm)>0x9 ? 102 : 54), \
-      ((char)(imm)&0xF0) ? 119 : (char)(imm) + ((char)(imm)>0x8 ? 103 : 55), \
-      ((char)(imm)&0xF0) ? 120 : (char)(imm) + ((char)(imm)>0x7 ? 104 : 56), \
-      ((char)(imm)&0xF0) ? 121 : (char)(imm) + ((char)(imm)>0x6 ? 105 : 57), \
-      ((char)(imm)&0xF0) ? 122 : (char)(imm) + ((char)(imm)>0x5 ? 106 : 58), \
-      ((char)(imm)&0xF0) ? 123 : (char)(imm) + ((char)(imm)>0x4 ? 107 : 59), \
-      ((char)(imm)&0xF0) ? 124 : (char)(imm) + ((char)(imm)>0x3 ? 108 : 60), \
-      ((char)(imm)&0xF0) ? 125 : (char)(imm) + ((char)(imm)>0x2 ? 109 : 61), \
-      ((char)(imm)&0xF0) ? 126 : (char)(imm) + ((char)(imm)>0x1 ? 110 : 62), \
-      ((char)(imm)&0xF0) ? 127 : (char)(imm) + ((char)(imm)>0x0 ? 111 : 63)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srai_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi16(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_srl_epi16(__A, __B),
+                                          (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_srl_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi16(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srli_epi16(__A, __B),
+                                         (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srli_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_si512());
+}
+
+#define _mm512_bsrli_epi128(a, imm) \
+  (__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
@@ -2080,7 +1537,7 @@
 {
   return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
                 (__v32hi) __A,
-                (__v32hi) _mm512_setzero_hi ());
+                (__v32hi) _mm512_setzero_si512 ());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -2096,24 +1553,23 @@
 {
   return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
                 (__v64qi) __A,
-                (__v64qi) _mm512_setzero_hi ());
+                (__v64qi) _mm512_setzero_si512 ());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
-                 (__v64qi) __O,
-                 __M);
+  return (__m512i) __builtin_ia32_selectb_512(__M,
+                                              (__v64qi)_mm512_set1_epi8(__A),
+                                              (__v64qi) __O);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
-                 (__v64qi)
-                 _mm512_setzero_qi(),
-                 __M);
+  return (__m512i) __builtin_ia32_selectb_512(__M,
+                                              (__v64qi) _mm512_set1_epi8(__A),
+                                              (__v64qi) _mm512_setzero_si512());
 }
 
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
@@ -2143,7 +1599,7 @@
 {
   return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P,
                  (__v32hi)
-                 _mm512_setzero_hi (),
+                 _mm512_setzero_si512 (),
                  (__mmask32) __U);
 }
 
@@ -2160,7 +1616,7 @@
 {
   return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P,
                  (__v64qi)
-                 _mm512_setzero_hi (),
+                 _mm512_setzero_si512 (),
                  (__mmask64) __U);
 }
 static __inline__ void __DEFAULT_FN_ATTRS
@@ -2182,61 +1638,56 @@
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_test_epi8_mask (__m512i __A, __m512i __B)
 {
-  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
-            (__v64qi) __B,
-            (__mmask64) -1);
+  return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B),
+                                  _mm512_setzero_si512());
 }
 
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
 {
-  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
-            (__v64qi) __B, __U);
+  return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
+                                       _mm512_setzero_si512());
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_test_epi16_mask (__m512i __A, __m512i __B)
 {
-  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
-            (__v32hi) __B,
-            (__mmask32) -1);
+  return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B),
+                                   _mm512_setzero_si512());
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
-            (__v32hi) __B, __U);
+  return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
+                                        _mm512_setzero_si512());
 }
 
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_testn_epi8_mask (__m512i __A, __m512i __B)
 {
-  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
-             (__v64qi) __B,
-             (__mmask64) -1);
+  return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512());
 }
 
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
 {
-  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
-             (__v64qi) __B, __U);
+  return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
+                                      _mm512_setzero_si512());
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_testn_epi16_mask (__m512i __A, __m512i __B)
 {
-  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
-             (__v32hi) __B,
-             (__mmask32) -1);
+  return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B),
+                                  _mm512_setzero_si512());
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
 {
-  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
-             (__v32hi) __B, __U);
+  return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
+                                       _mm512_setzero_si512());
 }
 
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
@@ -2266,8 +1717,7 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_broadcastb_epi8 (__m128i __A)
 {
-  return (__m512i)__builtin_shufflevector((__v16qi) __A,
-                                          (__v16qi)_mm_undefined_si128(),
+  return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -2293,24 +1743,23 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
-                 (__v32hi) __O,
-                 __M);
+  return (__m512i) __builtin_ia32_selectw_512(__M,
+                                              (__v32hi) _mm512_set1_epi16(__A),
+                                              (__v32hi) __O);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
-                 (__v32hi) _mm512_setzero_hi(),
-                 __M);
+  return (__m512i) __builtin_ia32_selectw_512(__M,
+                                              (__v32hi) _mm512_set1_epi16(__A),
+                                              (__v32hi) _mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_broadcastw_epi16 (__m128i __A)
 {
-  return (__m512i)__builtin_shufflevector((__v8hi) __A,
-                                          (__v8hi)_mm_undefined_si128(),
+  return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
@@ -2334,67 +1783,54 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
-                 (__v32hi) __A,
-                 (__v32hi) _mm512_undefined_epi32 (),
-                 (__mmask32) -1);
+  return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
         __m512i __B)
 {
-  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
-                 (__v32hi) __A,
-                 (__v32hi) _mm512_setzero_hi(),
-                 (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                    (__v32hi)_mm512_permutexvar_epi16(__A, __B),
+                                    (__v32hi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
              __m512i __B)
 {
-  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
-                 (__v32hi) __A,
-                 (__v32hi) __W,
-                 (__mmask32) __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                    (__v32hi)_mm512_permutexvar_epi16(__A, __B),
+                                    (__v32hi)__W);
 }
 
-#define _mm512_alignr_epi8(A, B, N) __extension__ ({\
-  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
-                                          (__v64qi)(__m512i)(B), (int)(N), \
-                                          (__v64qi)_mm512_undefined_pd(), \
-                                          (__mmask64)-1); })
+#define _mm512_alignr_epi8(A, B, N) \
+  (__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
+                                     (__v64qi)(__m512i)(B), (int)(N))
 
-#define _mm512_mask_alignr_epi8(W, U, A, B, N) __extension__({\
-  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
-                                          (__v64qi)(__m512i)(B), (int)(N), \
-                                          (__v64qi)(__m512i)(W), \
-                                          (__mmask64)(U)); })
+#define _mm512_mask_alignr_epi8(W, U, A, B, N) \
+  (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+                             (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
+                             (__v64qi)(__m512i)(W))
 
-#define _mm512_maskz_alignr_epi8(U, A, B, N) __extension__({\
-  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
-                                          (__v64qi)(__m512i)(B), (int)(N), \
-                                          (__v64qi)_mm512_setzero_si512(), \
-                                          (__mmask64)(U)); })
+#define _mm512_maskz_alignr_epi8(U, A, B, N) \
+  (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+                              (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
+                              (__v64qi)(__m512i)_mm512_setzero_si512())
 
-#define _mm512_dbsad_epu8(A, B, imm) __extension__ ({\
-  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
-                                           (__v64qi)(__m512i)(B), (int)(imm), \
-                                           (__v32hi)_mm512_undefined_epi32(), \
-                                           (__mmask32)-1); })
+#define _mm512_dbsad_epu8(A, B, imm) \
+  (__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
+                                      (__v64qi)(__m512i)(B), (int)(imm))
 
-#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) ({\
-  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
-                                           (__v64qi)(__m512i)(B), (int)(imm), \
-                                           (__v32hi)(__m512i)(W), \
-                                           (__mmask32)(U)); })
+#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                  (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
+                                  (__v32hi)(__m512i)(W))
 
-#define _mm512_maskz_dbsad_epu8(U, A, B, imm) ({\
-  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
-                                           (__v64qi)(__m512i)(B), (int)(imm), \
-                                           (__v32hi)_mm512_setzero_hi(), \
-                                           (__mmask32)(U)); })
+#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                  (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
+                                  (__v32hi)_mm512_setzero_si512())
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sad_epu8 (__m512i __A, __m512i __B)
diff --git a/darwin-x86/clang-headers/avx512cdintrin.h b/darwin-x86/clang-headers/avx512cdintrin.h
index 23c4235..e639027 100644
--- a/darwin-x86/clang-headers/avx512cdintrin.h
+++ b/darwin-x86/clang-headers/avx512cdintrin.h
@@ -29,7 +29,7 @@
 #define __AVX512CDINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512)))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_conflict_epi64 (__m512i __A)
@@ -82,61 +82,58 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_lzcnt_epi32 (__m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
-             (__v16si) _mm512_setzero_si512 (),
-             (__mmask16) -1);
+  return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
-                 (__v16si) __W,
-                 (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_lzcnt_epi32(__A),
+                                             (__v16si)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
-             (__v16si) _mm512_setzero_si512 (),
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_lzcnt_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_lzcnt_epi64 (__m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
-             (__v8di) _mm512_setzero_si512 (),
-             (__mmask8) -1);
+  return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
-                 (__v8di) __W,
-                 (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_lzcnt_epi64(__A),
+                                             (__v8di)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
-             (__v8di) _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_lzcnt_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_broadcastmb_epi64 (__mmask8 __A)
 {
-  return (__m512i) __builtin_ia32_broadcastmb512 (__A);
+  return (__m512i) _mm512_set1_epi64((long long) __A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_broadcastmw_epi32 (__mmask16 __A)
 {
-  return (__m512i) __builtin_ia32_broadcastmw512 (__A);
+  return (__m512i) _mm512_set1_epi32((int) __A);
 }
 
 #undef __DEFAULT_FN_ATTRS
diff --git a/darwin-x86/clang-headers/avx512dqintrin.h b/darwin-x86/clang-headers/avx512dqintrin.h
index 13665e4..8a00b3a 100644
--- a/darwin-x86/clang-headers/avx512dqintrin.h
+++ b/darwin-x86/clang-headers/avx512dqintrin.h
@@ -29,7 +29,7 @@
 #define __AVX512DQINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512)))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mullo_epi64 (__m512i __A, __m512i __B) {
@@ -37,204 +37,169 @@
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di) __W,
-              (__mmask8) __U);
+_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_mullo_epi64(__A, __B),
+                                             (__v8di)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) {
-  return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) __U);
+_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_mullo_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_xor_pd (__m512d __A, __m512d __B) {
-  return (__m512d) ((__v8du) __A ^ (__v8du) __B);
+_mm512_xor_pd(__m512d __A, __m512d __B) {
+  return (__m512d)((__v8du)__A ^ (__v8du)__B);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df) __W,
-             (__mmask8) __U);
+_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_xor_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df)
-             _mm512_setzero_pd (),
-             (__mmask8) __U);
+_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_xor_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_xor_ps (__m512 __A, __m512 __B) {
-  return (__m512) ((__v16su) __A ^ (__v16su) __B);
+  return (__m512)((__v16su)__A ^ (__v16su)__B);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf) __W,
-            (__mmask16) __U);
+_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_xor_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf)
-            _mm512_setzero_ps (),
-            (__mmask16) __U);
+_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_xor_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_or_pd (__m512d __A, __m512d __B) {
-  return (__m512d) ((__v8du) __A | (__v8du) __B);
+_mm512_or_pd(__m512d __A, __m512d __B) {
+  return (__m512d)((__v8du)__A | (__v8du)__B);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
-            (__v8df) __B,
-            (__v8df) __W,
-            (__mmask8) __U);
+_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_or_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
-            (__v8df) __B,
-            (__v8df)
-            _mm512_setzero_pd (),
-            (__mmask8) __U);
+_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_or_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_or_ps (__m512 __A, __m512 __B) {
-  return (__m512) ((__v16su) __A | (__v16su) __B);
+_mm512_or_ps(__m512 __A, __m512 __B) {
+  return (__m512)((__v16su)__A | (__v16su)__B);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
-                 (__v16sf) __B,
-                 (__v16sf) __W,
-                 (__mmask16) __U);
+_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_or_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
-                 (__v16sf) __B,
-                 (__v16sf)
-                 _mm512_setzero_ps (),
-                 (__mmask16) __U);
+_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_or_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_and_pd (__m512d __A, __m512d __B) {
-  return (__m512d) ((__v8du) __A & (__v8du) __B);
+_mm512_and_pd(__m512d __A, __m512d __B) {
+  return (__m512d)((__v8du)__A & (__v8du)__B);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df) __W,
-             (__mmask8) __U);
+_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_and_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df)
-             _mm512_setzero_pd (),
-             (__mmask8) __U);
+_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_and_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_and_ps (__m512 __A, __m512 __B) {
-  return (__m512) ((__v16su) __A & (__v16su) __B);
+_mm512_and_ps(__m512 __A, __m512 __B) {
+  return (__m512)((__v16su)__A & (__v16su)__B);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf) __W,
-            (__mmask16) __U);
+_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_and_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf)
-            _mm512_setzero_ps (),
-            (__mmask16) __U);
+_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_and_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_andnot_pd (__m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
-              (__v8df) __B,
-              (__v8df)
-              _mm512_setzero_pd (),
-              (__mmask8) -1);
+_mm512_andnot_pd(__m512d __A, __m512d __B) {
+  return (__m512d)(~(__v8du)__A & (__v8du)__B);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
-              (__v8df) __B,
-              (__v8df) __W,
-              (__mmask8) __U);
+_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_andnot_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
-              (__v8df) __B,
-              (__v8df)
-              _mm512_setzero_pd (),
-              (__mmask8) __U);
+_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_andnot_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_andnot_ps (__m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
-             (__v16sf) __B,
-             (__v16sf)
-             _mm512_setzero_ps (),
-             (__mmask16) -1);
+_mm512_andnot_ps(__m512 __A, __m512 __B) {
+  return (__m512)(~(__v16su)__A & (__v16su)__B);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
-             (__v16sf) __B,
-             (__v16sf) __W,
-             (__mmask16) __U);
+_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_andnot_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
-             (__v16sf) __B,
-             (__v16sf)
-             _mm512_setzero_ps (),
-             (__mmask16) __U);
+_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_andnot_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -261,20 +226,20 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundpd_epi64(A, R) __extension__ ({              \
+#define _mm512_cvt_roundpd_epi64(A, R) \
   (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
                                            (__v8di)_mm512_setzero_si512(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \
   (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
                                            (__v8di)(__m512i)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) __extension__ ({   \
+#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \
   (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
                                            (__v8di)_mm512_setzero_si512(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvtpd_epu64 (__m512d __A) {
@@ -300,20 +265,20 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundpd_epu64(A, R) __extension__ ({               \
+#define _mm512_cvt_roundpd_epu64(A, R) \
   (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
                                             (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \
   (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
                                             (__v8di)(__m512i)(W), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) __extension__ ({     \
+#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \
   (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
                                             (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvtps_epi64 (__m256 __A) {
@@ -339,20 +304,20 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundps_epi64(A, R) __extension__ ({             \
+#define _mm512_cvt_roundps_epi64(A, R) \
   (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
                                            (__v8di)_mm512_setzero_si512(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \
   (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
                                            (__v8di)(__m512i)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundps_epi64(U, A, R) __extension__ ({   \
+#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \
   (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
                                            (__v8di)_mm512_setzero_si512(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvtps_epu64 (__m256 __A) {
@@ -378,60 +343,55 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundps_epu64(A, R) __extension__ ({              \
+#define _mm512_cvt_roundps_epu64(A, R) \
   (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
                                             (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \
   (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
                                             (__v8di)(__m512i)(W), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundps_epu64(U, A, R) __extension__ ({   \
+#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \
   (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
                                             (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_cvtepi64_pd (__m512i __A) {
-  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
-                (__v8df) _mm512_setzero_pd(),
-                (__mmask8) -1,
-                _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_convertvector((__v8di)__A, __v8df);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
-  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
-                (__v8df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_cvtepi64_pd(__A),
+                                              (__v8df)__W);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
-  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
-                (__v8df) _mm512_setzero_pd(),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_cvtepi64_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-#define _mm512_cvt_roundepi64_pd(A, R) __extension__ ({          \
+#define _mm512_cvt_roundepi64_pd(A, R) \
   (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \
   (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
                                            (__v8df)(__m512d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \
   (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm512_cvtepi64_ps (__m512i __A) {
@@ -457,20 +417,20 @@
                _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundepi64_ps(A, R) __extension__ ({        \
+#define _mm512_cvt_roundepi64_ps(A, R) \
   (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \
   (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \
   (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -497,20 +457,20 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundpd_epi64(A, R) __extension__ ({             \
+#define _mm512_cvtt_roundpd_epi64(A, R) \
   (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
                                             (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \
   (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
                                             (__v8di)(__m512i)(W), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \
   (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
                                             (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttpd_epu64 (__m512d __A) {
@@ -536,20 +496,20 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundpd_epu64(A, R) __extension__ ({              \
+#define _mm512_cvtt_roundpd_epu64(A, R) \
   (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
                                              (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)-1, (int)(R)); })
+                                             (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \
   (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
                                              (__v8di)(__m512i)(W), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) __extension__ ({   \
+#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \
   (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
                                              (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttps_epi64 (__m256 __A) {
@@ -575,20 +535,20 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundps_epi64(A, R) __extension__ ({            \
+#define _mm512_cvtt_roundps_epi64(A, R) \
   (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
                                             (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \
   (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
                                             (__v8di)(__m512i)(W), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) __extension__ ({  \
+#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \
   (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
                                             (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttps_epu64 (__m256 __A) {
@@ -614,60 +574,55 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundps_epu64(A, R) __extension__ ({            \
+#define _mm512_cvtt_roundps_epu64(A, R) \
   (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
                                              (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)-1, (int)(R)); })
+                                             (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \
   (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
                                              (__v8di)(__m512i)(W), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) __extension__ ({  \
+#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \
   (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
                                              (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_cvtepu64_pd (__m512i __A) {
-  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
-                 (__v8df) _mm512_setzero_pd(),
-                 (__mmask8) -1,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_convertvector((__v8du)__A, __v8df);
 }
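+/* __builtin_convertvector performs the element-wise conversion (here
+ * unsigned 64-bit -> double), letting the backend select vcvtuqq2pd; the
+ * masked variants below blend its result against the pass-through or zero
+ * vector with a per-element select. */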
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
-  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
-                 (__v8df) __W,
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_cvtepu64_pd(__A),
+                                              (__v8df)__W);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
-  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
-                 (__v8df) _mm512_setzero_pd(),
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_cvtepu64_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-#define _mm512_cvt_roundepu64_pd(A, R) __extension__ ({          \
+#define _mm512_cvt_roundepu64_pd(A, R) \
   (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
                                             (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \
   (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
                                             (__v8df)(__m512d)(W), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
-#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \
   (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
                                             (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -694,292 +649,292 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundepu64_ps(A, R) __extension__ ({         \
+#define _mm512_cvt_roundepu64_ps(A, R) \
   (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
                                            (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \
   (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
                                            (__v8sf)(__m256)(W), (__mmask8)(U), \
-                                           (int)(R)); })
+                                           (int)(R))
 
-#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \
   (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
                                            (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_range_pd(A, B, C) __extension__ ({                     \
+#define _mm512_range_pd(A, B, C) \
   (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
                                           (__v8df)(__m512d)(B), (int)(C), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)-1, \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
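+/* range_{pd,ps,sd,ss}: per the VRANGE encoding, imm8 selects the operation
+ * (min, max, absolute min, absolute max) and how the sign of the result is
+ * produced; the *_round_* forms also take explicit rounding/SAE control. */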
 
-#define _mm512_mask_range_pd(W, U, A, B, C) __extension__ ({      \
+#define _mm512_mask_range_pd(W, U, A, B, C) \
   (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
                                           (__v8df)(__m512d)(B), (int)(C), \
                                           (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_range_pd(U, A, B, C) __extension__ ({           \
+#define _mm512_maskz_range_pd(U, A, B, C) \
   (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
                                           (__v8df)(__m512d)(B), (int)(C), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(U), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_range_round_pd(A, B, C, R) __extension__ ({           \
+#define _mm512_range_round_pd(A, B, C, R) \
   (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
                                           (__v8df)(__m512d)(B), (int)(C), \
                                           (__v8df)_mm512_setzero_pd(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_range_round_pd(W, U, A, B, C, R) __extension__ ({ \
+#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \
   (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
                                           (__v8df)(__m512d)(B), (int)(C), \
                                           (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm512_maskz_range_round_pd(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_range_round_pd(U, A, B, C, R) \
   (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
                                           (__v8df)(__m512d)(B), (int)(C), \
                                           (__v8df)_mm512_setzero_pd(), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-#define _mm512_range_ps(A, B, C) __extension__ ({                       \
+#define _mm512_range_ps(A, B, C) \
   (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
                                          (__v16sf)(__m512)(B), (int)(C), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)-1, \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_range_ps(W, U, A, B, C) __extension__ ({         \
+#define _mm512_mask_range_ps(W, U, A, B, C) \
   (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
                                          (__v16sf)(__m512)(B), (int)(C), \
                                          (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_range_ps(U, A, B, C) __extension__ ({      \
+#define _mm512_maskz_range_ps(U, A, B, C) \
   (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
                                          (__v16sf)(__m512)(B), (int)(C), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(U), \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_range_round_ps(A, B, C, R) __extension__ ({         \
+#define _mm512_range_round_ps(A, B, C, R) \
   (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
                                          (__v16sf)(__m512)(B), (int)(C), \
                                          (__v16sf)_mm512_setzero_ps(), \
-                                         (__mmask16)-1, (int)(R)); })
+                                         (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_range_round_ps(W, U, A, B, C, R) __extension__ ({ \
+#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \
   (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
                                          (__v16sf)(__m512)(B), (int)(C), \
                                          (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                         (int)(R)); })
+                                         (int)(R))
 
-#define _mm512_maskz_range_round_ps(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_range_round_ps(U, A, B, C, R) \
   (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
                                          (__v16sf)(__m512)(B), (int)(C), \
                                          (__v16sf)_mm512_setzero_ps(), \
-                                         (__mmask16)(U), (int)(R)); })
+                                         (__mmask16)(U), (int)(R))
 
-#define _mm_range_round_ss(A, B, C, R) __extension__ ({           \
+#define _mm_range_round_ss(A, B, C, R) \
   (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8) -1, (int)(C),\
-                                               (int)(R)); })
+                                               (int)(R))
 
 #define _mm_range_ss(A, B, C) _mm_range_round_ss(A, B, C, _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_range_round_ss(W, U, A, B, C, R) __extension__ ({ \
+#define _mm_mask_range_round_ss(W, U, A, B, C, R) \
   (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W),\
                                                (__mmask8)(U), (int)(C),\
-                                               (int)(R)); })
+                                               (int)(R))
 
 #define _mm_mask_range_ss(W, U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C, _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_maskz_range_round_ss(U, A, B, C, R) __extension__ ({ \
+#define _mm_maskz_range_round_ss(U, A, B, C, R) \
   (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(C),\
-                                               (int)(R)); })
+                                               (int)(R))
 
 #define _mm_maskz_range_ss(U, A, B, C) _mm_maskz_range_round_ss(U, A, B, C, _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_range_round_sd(A, B, C, R) __extension__ ({           \
+#define _mm_range_round_sd(A, B, C, R) \
   (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8) -1, (int)(C),\
-                                                (int)(R)); })
+                                                (int)(R))
 
 #define _mm_range_sd(A, B, C) _mm_range_round_sd(A, B, C, _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_range_round_sd(W, U, A, B, C, R) __extension__ ({ \
+#define _mm_mask_range_round_sd(W, U, A, B, C, R) \
   (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W),\
                                                 (__mmask8)(U), (int)(C),\
-                                                (int)(R)); })
+                                                (int)(R))
 
 #define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C, _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_maskz_range_round_sd(U, A, B, C, R) __extension__ ({ \
+#define _mm_maskz_range_round_sd(U, A, B, C, R) \
   (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(C),\
-                                                (int)(R)); })
+                                                (int)(R))
 
 #define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C, _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_reduce_pd(A, B) __extension__ ({             \
+#define _mm512_reduce_pd(A, B) \
   (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)-1, \
-                                           _MM_FROUND_CUR_DIRECTION); })
+                                           _MM_FROUND_CUR_DIRECTION)
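+/* reduce_{pd,ps,sd,ss}: each element x becomes x minus x rounded to a fixed
+ * number of binary fraction bits; imm8 encodes that bit count together with
+ * the rounding mode used for the intermediate result (VREDUCE). */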
 
-#define _mm512_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+#define _mm512_mask_reduce_pd(W, U, A, B) \
   (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), \
-                                           _MM_FROUND_CUR_DIRECTION); })
+                                           _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_reduce_pd(U, A, B) __extension__ ({  \
+#define _mm512_maskz_reduce_pd(U, A, B) \
   (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), \
-                                           _MM_FROUND_CUR_DIRECTION); })
+                                           _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_reduce_ps(A, B) __extension__ ({              \
+#define _mm512_reduce_ps(A, B) \
   (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)-1, \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_reduce_ps(W, U, A, B) __extension__ ({   \
+#define _mm512_mask_reduce_ps(W, U, A, B) \
   (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_reduce_ps(U, A, B) __extension__ ({       \
+#define _mm512_maskz_reduce_ps(U, A, B) \
   (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_reduce_round_pd(A, B, R) __extension__ ({\
+#define _mm512_reduce_round_pd(A, B, R) \
   (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
                                            (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_reduce_round_pd(W, U, A, B, R) __extension__ ({\
+#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \
   (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
                                            (__v8df)(__m512d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_reduce_round_pd(U, A, B, R) __extension__ ({\
+#define _mm512_maskz_reduce_round_pd(U, A, B, R) \
   (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
                                            (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_reduce_round_ps(A, B, R) __extension__ ({\
+#define _mm512_reduce_round_ps(A, B, R) \
   (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
                                           (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)-1, (int)(R)); })
+                                          (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_reduce_round_ps(W, U, A, B, R) __extension__ ({\
+#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \
   (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
                                           (__v16sf)(__m512)(W), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_reduce_round_ps(U, A, B, R) __extension__ ({\
+#define _mm512_maskz_reduce_round_ps(U, A, B, R) \
   (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
                                           (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
-#define _mm_reduce_ss(A, B, C) __extension__ ({              \
+#define _mm_reduce_ss(A, B, C) \
   (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
-                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_reduce_ss(W, U, A, B, C) __extension__ ({   \
+#define _mm_mask_reduce_ss(W, U, A, B, C) \
   (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_maskz_reduce_ss(U, A, B, C) __extension__ ({       \
+#define _mm_maskz_reduce_ss(U, A, B, C) \
   (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)_mm_setzero_ps(), \
                                        (__mmask8)(U), (int)(C), \
-                                       _MM_FROUND_CUR_DIRECTION); })
+                                       _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_reduce_round_ss(A, B, C, R) __extension__ ({              \
+#define _mm_reduce_round_ss(A, B, C, R) \
   (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
-                                       (int)(C), (int)(R)); })
+                                       (int)(C), (int)(R))
 
-#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) __extension__ ({   \
+#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \
   (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                       (int)(C), (int)(R)); })
+                                       (int)(C), (int)(R))
 
-#define _mm_maskz_reduce_round_ss(U, A, B, C, R) __extension__ ({       \
+#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \
   (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)_mm_setzero_ps(), \
-                                       (__mmask8)(U), (int)(C), (int)(R)); })
+                                       (__mmask8)(U), (int)(C), (int)(R))
 
-#define _mm_reduce_sd(A, B, C) __extension__ ({              \
+#define _mm_reduce_sd(A, B, C) \
   (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)_mm_setzero_pd(), \
                                         (__mmask8)-1, (int)(C), \
-                                        _MM_FROUND_CUR_DIRECTION); })
+                                        _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_reduce_sd(W, U, A, B, C) __extension__ ({   \
+#define _mm_mask_reduce_sd(W, U, A, B, C) \
   (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
-                                        (int)(C), _MM_FROUND_CUR_DIRECTION); })
+                                        (int)(C), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_maskz_reduce_sd(U, A, B, C) __extension__ ({       \
+#define _mm_maskz_reduce_sd(U, A, B, C) \
   (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)_mm_setzero_pd(), \
                                         (__mmask8)(U), (int)(C), \
-                                        _MM_FROUND_CUR_DIRECTION); })
+                                        _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_reduce_round_sd(A, B, C, R) __extension__ ({              \
+#define _mm_reduce_round_sd(A, B, C, R) \
   (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)_mm_setzero_pd(), \
-                                        (__mmask8)-1, (int)(C), (int)(R)); })
+                                        (__mmask8)-1, (int)(C), (int)(R))
 
-#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) __extension__ ({   \
+#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \
   (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
-                                        (int)(C), (int)(R)); })
+                                        (int)(C), (int)(R))
 
-#define _mm_maskz_reduce_round_sd(U, A, B, C, R) __extension__ ({       \
+#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \
   (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)_mm_setzero_pd(), \
-                                        (__mmask8)(U), (int)(C), (int)(R)); })
-                     
+                                        (__mmask8)(U), (int)(C), (int)(R))
+
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_movepi32_mask (__m512i __A)
 {
@@ -1008,323 +963,298 @@
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_broadcast_f32x2 (__m128 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
-                (__v16sf)_mm512_undefined_ps(),
-                (__mmask16) -1);
+  return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+                                         0, 1, 0, 1, 0, 1, 0, 1,
+                                         0, 1, 0, 1, 0, 1, 0, 1);
 }
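+/* The broadcast is expressed as a __builtin_shufflevector whose repeating
+ * index pattern tiles the low lanes of __A across the full 512-bit result;
+ * the masked variants then select between this and __O or zero. */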
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
-                (__v16sf)
-                __O, __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+                                             (__v16sf)_mm512_broadcast_f32x2(__A),
+                                             (__v16sf)__O);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
-                (__v16sf)_mm512_setzero_ps (),
-                __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+                                             (__v16sf)_mm512_broadcast_f32x2(__A),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_broadcast_f32x8 (__m256 __A)
+_mm512_broadcast_f32x8(__m256 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
-                _mm512_undefined_ps(),
-                (__mmask16) -1);
+  return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
+                                         0, 1, 2, 3, 4, 5, 6, 7,
+                                         0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
-                (__v16sf)__O,
-                __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+                                           (__v16sf)_mm512_broadcast_f32x8(__A),
+                                           (__v16sf)__O);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
-                (__v16sf)_mm512_setzero_ps (),
-                __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+                                           (__v16sf)_mm512_broadcast_f32x8(__A),
+                                           (__v16sf)_mm512_setzero_ps());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_broadcast_f64x2 (__m128d __A)
+_mm512_broadcast_f64x2(__m128d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
-                 (__v8df)_mm512_undefined_pd(),
-                 (__mmask8) -1);
+  return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
+                                          0, 1, 0, 1, 0, 1, 0, 1);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
-                 (__v8df)
-                 __O, __M);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+                                            (__v8df)_mm512_broadcast_f64x2(__A),
+                                            (__v8df)__O);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
-                 (__v8df)_mm512_setzero_ps (),
-                 __M);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+                                            (__v8df)_mm512_broadcast_f64x2(__A),
+                                            (__v8df)_mm512_setzero_pd());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_broadcast_i32x2 (__m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
-                 (__v16si)_mm512_setzero_si512(),
-                 (__mmask16) -1);
+  return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+                                          0, 1, 0, 1, 0, 1, 0, 1,
+                                          0, 1, 0, 1, 0, 1, 0, 1);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
-                 (__v16si)
-                 __O, __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                             (__v16si)_mm512_broadcast_i32x2(__A),
+                                             (__v16si)__O);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
-                 (__v16si)_mm512_setzero_si512 (),
-                 __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                             (__v16si)_mm512_broadcast_i32x2(__A),
+                                             (__v16si)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i32x8 (__m256i __A)
+_mm512_broadcast_i32x8(__m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
-                 (__v16si)_mm512_setzero_si512(),
-                 (__mmask16) -1);
+  return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
+                                          0, 1, 2, 3, 4, 5, 6, 7,
+                                          0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
-                 (__v16si)__O,
-                 __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                           (__v16si)_mm512_broadcast_i32x8(__A),
+                                           (__v16si)__O);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
-                 (__v16si)
-                 _mm512_setzero_si512 (),
-                 __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                           (__v16si)_mm512_broadcast_i32x8(__A),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i64x2 (__m128i __A)
+_mm512_broadcast_i64x2(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
-                 (__v8di)_mm512_setzero_si512(),
-                 (__mmask8) -1);
+  return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
+                                          0, 1, 0, 1, 0, 1, 0, 1);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
-                 (__v8di)
-                 __O, __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                            (__v8di)_mm512_broadcast_i64x2(__A),
+                                            (__v8di)__O);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
-                 (__v8di)_mm512_setzero_si512 (),
-                 __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                            (__v8di)_mm512_broadcast_i64x2(__A),
+                                            (__v8di)_mm512_setzero_si512());
 }
 
-#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \
+#define _mm512_extractf32x8_ps(A, imm) \
   (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
-                                           (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)-1); })
+                                           (__v8sf)_mm256_undefined_ps(), \
+                                           (__mmask8)-1)
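+/* With a constant all-ones mask the pass-through operand can never be
+ * selected, so the unmasked extract forms pass an undefined vector instead
+ * of materializing a zeroed one. */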
 
-#define _mm512_mask_extractf32x8_ps(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
   (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v8sf)(__m256)(W), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm512_maskz_extractf32x8_ps(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_extractf32x8_ps(U, A, imm) \
   (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm512_extractf64x2_pd(A, imm) __extension__ ({ \
+#define _mm512_extractf64x2_pd(A, imm) \
   (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
                                                 (int)(imm), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)-1); })
+                                                (__v2df)_mm_undefined_pd(), \
+                                                (__mmask8)-1)
 
-#define _mm512_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
   (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
                                                 (int)(imm), \
                                                 (__v2df)(__m128d)(W), \
-                                                (__mmask8)(U)); })
+                                                (__mmask8)(U))
 
-#define _mm512_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_extractf64x2_pd(U, A, imm) \
   (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
                                                 (int)(imm), \
                                                 (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)(U)); })
+                                                (__mmask8)(U))
 
-#define _mm512_extracti32x8_epi32(A, imm) __extension__ ({ \
+#define _mm512_extracti32x8_epi32(A, imm) \
   (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
-                                            (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8)-1); })
+                                            (__v8si)_mm256_undefined_si256(), \
+                                            (__mmask8)-1)
 
-#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
   (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v8si)(__m256i)(W), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm512_maskz_extracti32x8_epi32(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \
   (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm512_extracti64x2_epi64(A, imm) __extension__ ({ \
+#define _mm512_extracti64x2_epi64(A, imm) \
   (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
                                                 (int)(imm), \
-                                                (__v2di)_mm_setzero_di(), \
-                                                (__mmask8)-1); })
+                                                (__v2di)_mm_undefined_si128(), \
+                                                (__mmask8)-1)
 
-#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
   (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
                                                 (int)(imm), \
                                                 (__v2di)(__m128i)(W), \
-                                                (__mmask8)(U)); })
+                                                (__mmask8)(U))
 
-#define _mm512_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \
   (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
                                                 (int)(imm), \
-                                                (__v2di)_mm_setzero_di(), \
-                                                (__mmask8)(U)); })
+                                                (__v2di)_mm_setzero_si128(), \
+                                                (__mmask8)(U))
 
-#define _mm512_insertf32x8(A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
-                                          (__v8sf)(__m256)(B), (int)(imm), \
-                                          (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)-1); })
+#define _mm512_insertf32x8(A, B, imm) \
+  (__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
+                                     (__v8sf)(__m256)(B), (int)(imm))
 
-#define _mm512_mask_insertf32x8(W, U, A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
-                                          (__v8sf)(__m256)(B), (int)(imm), \
-                                          (__v16sf)(__m512)(W), \
-                                          (__mmask16)(U)); })
+#define _mm512_mask_insertf32x8(W, U, A, B, imm) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+                                 (__v16sf)(__m512)(W))
 
-#define _mm512_maskz_insertf32x8(U, A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
-                                          (__v8sf)(__m256)(B), (int)(imm), \
-                                          (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(U)); })
+#define _mm512_maskz_insertf32x8(U, A, B, imm) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+                                 (__v16sf)_mm512_setzero_ps())
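+/* Merge- and zero-masking are layered on top of the unmasked insert by
+ * blending its result with the pass-through (or zero) vector via a
+ * per-element select, rather than through a separate masked builtin. */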
 
-#define _mm512_insertf64x2(A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (int)(imm), \
-                                               (__v8df)_mm512_setzero_pd(), \
-                                               (__mmask8)-1); })
+#define _mm512_insertf64x2(A, B, imm) \
+  (__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(imm))
 
-#define _mm512_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (int)(imm), \
-                                               (__v8df)(__m512d)(W), \
-                                               (__mmask8)(U)); })
+#define _mm512_mask_insertf64x2(W, U, A, B, imm) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+                                  (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (int)(imm), \
-                                               (__v8df)_mm512_setzero_pd(), \
-                                               (__mmask8)(U)); })
+#define _mm512_maskz_insertf64x2(U, A, B, imm) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+                                  (__v8df)_mm512_setzero_pd())
 
-#define _mm512_inserti32x8(A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
-                                           (__v8si)(__m256i)(B), (int)(imm), \
-                                           (__v16si)_mm512_setzero_si512(), \
-                                           (__mmask16)-1); })
+#define _mm512_inserti32x8(A, B, imm) \
+  (__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
+                                      (__v8si)(__m256i)(B), (int)(imm))
 
-#define _mm512_mask_inserti32x8(W, U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
-                                           (__v8si)(__m256i)(B), (int)(imm), \
-                                           (__v16si)(__m512i)(W), \
-                                           (__mmask16)(U)); })
+#define _mm512_mask_inserti32x8(W, U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+                                 (__v16si)(__m512i)(W))
 
-#define _mm512_maskz_inserti32x8(U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
-                                           (__v8si)(__m256i)(B), (int)(imm), \
-                                           (__v16si)_mm512_setzero_si512(), \
-                                           (__mmask16)(U)); })
+#define _mm512_maskz_inserti32x8(U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+                                 (__v16si)_mm512_setzero_si512())
 
-#define _mm512_inserti64x2(A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
-                                               (__v2di)(__m128i)(B), \
-                                               (int)(imm), \
-                                               (__v8di)_mm512_setzero_si512(), \
-                                               (__mmask8)-1); })
+#define _mm512_inserti64x2(A, B, imm) \
+  (__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
+                                          (__v2di)(__m128i)(B), (int)(imm))
 
-#define _mm512_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
-                                               (__v2di)(__m128i)(B), \
-                                               (int)(imm), \
-                                               (__v8di)(__m512i)(W), \
-                                               (__mmask8)(U)); })
+#define _mm512_mask_inserti64x2(W, U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+                                  (__v8di)(__m512i)(W))
 
-#define _mm512_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
-                                               (__v2di)(__m128i)(B), \
-                                               (int)(imm), \
-                                               (__v8di)_mm512_setzero_si512(), \
-                                               (__mmask8)(U)); })
+#define _mm512_maskz_inserti64x2(U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+                                  (__v8di)_mm512_setzero_si512())
 
-#define _mm512_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+#define _mm512_mask_fpclass_ps_mask(U, A, imm) \
   (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
-                                              (int)(imm), (__mmask16)(U)); })
+                                              (int)(imm), (__mmask16)(U))
 
-#define _mm512_fpclass_ps_mask(A, imm) __extension__ ({ \
+#define _mm512_fpclass_ps_mask(A, imm) \
   (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
-                                              (int)(imm), (__mmask16)-1); })
+                                              (int)(imm), (__mmask16)-1)
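+/* fpclass: imm8 is a bitwise OR of the floating-point classes to test for
+ * (QNaN, SNaN, +/-0, +/-Inf, denormal, negative); each element that matches
+ * sets its bit in the returned mask. */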
 
-#define _mm512_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+#define _mm512_mask_fpclass_pd_mask(U, A, imm) \
   (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm512_fpclass_pd_mask(A, imm) __extension__ ({ \
+#define _mm512_fpclass_pd_mask(A, imm) \
   (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
-                                             (__mmask8)-1); })
+                                             (__mmask8)-1)
 
-#define _mm_fpclass_sd_mask(A, imm) __extension__ ({ \
+#define _mm_fpclass_sd_mask(A, imm) \
   (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
-                                          (__mmask8)-1); })
+                                          (__mmask8)-1)
 
-#define _mm_mask_fpclass_sd_mask(U, A, imm) __extension__ ({ \
+#define _mm_mask_fpclass_sd_mask(U, A, imm) \
   (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
-#define _mm_fpclass_ss_mask(A, imm) __extension__ ({ \
+#define _mm_fpclass_ss_mask(A, imm) \
   (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                          (__mmask8)-1); })
+                                          (__mmask8)-1)
 
-#define _mm_mask_fpclass_ss_mask(U, A, imm) __extension__ ({ \
+#define _mm_mask_fpclass_ss_mask(U, A, imm) \
   (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
 #undef __DEFAULT_FN_ATTRS
 
diff --git a/darwin-x86/clang-headers/avx512erintrin.h b/darwin-x86/clang-headers/avx512erintrin.h
index 8ff212c..6348275 100644
--- a/darwin-x86/clang-headers/avx512erintrin.h
+++ b/darwin-x86/clang-headers/avx512erintrin.h
@@ -27,21 +27,21 @@
 #ifndef __AVX512ERINTRIN_H
 #define __AVX512ERINTRIN_H
 
-// exp2a23
-#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+/* exp2a23: approximate 2^x, with a maximum relative error of 2^-23 */
+#define _mm512_exp2a23_round_pd(A, R) \
   (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
-                                      (__mmask8)-1, (int)(R)); })
+                                      (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
   (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                      (int)(R)); })
+                                      (int)(R))
 
-#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
   (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
-                                      (__mmask8)(M), (int)(R)); })
+                                      (__mmask8)(M), (int)(R))
 
 #define _mm512_exp2a23_pd(A) \
   _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -52,20 +52,20 @@
 #define _mm512_maskz_exp2a23_pd(M, A) \
   _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
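+/* The unsuffixed wrappers use _MM_FROUND_CUR_DIRECTION, i.e. the rounding
+ * mode currently configured in MXCSR. */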
 
-#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+#define _mm512_exp2a23_round_ps(A, R) \
   (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
-                                     (__mmask16)-1, (int)(R)); })
+                                     (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
   (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                     (int)(R)); })
+                                     (int)(R))
 
-#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
   (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
-                                     (__mmask16)(M), (int)(R)); })
+                                     (__mmask16)(M), (int)(R))
 
 #define _mm512_exp2a23_ps(A) \
   _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -76,21 +76,21 @@
 #define _mm512_maskz_exp2a23_ps(M, A) \
   _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
 
-// rsqrt28
-#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+/* rsqrt28: approximate 1/sqrt(x), with a maximum relative error of 2^-28 */
+#define _mm512_rsqrt28_round_pd(A, R) \
   (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                          (__v8df)_mm512_setzero_pd(), \
-                                         (__mmask8)-1, (int)(R)); })
+                                         (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
   (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                         (int)(R)); })
+                                         (int)(R))
 
-#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
   (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                          (__v8df)_mm512_setzero_pd(), \
-                                         (__mmask8)(M), (int)(R)); })
+                                         (__mmask8)(M), (int)(R))
 
 #define _mm512_rsqrt28_pd(A) \
   _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -101,20 +101,20 @@
 #define _mm512_maskz_rsqrt28_pd(M, A) \
   _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+#define _mm512_rsqrt28_round_ps(A, R) \
   (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                         (__v16sf)_mm512_setzero_ps(), \
-                                        (__mmask16)-1, (int)(R)); })
+                                        (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
   (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                         (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                        (int)(R)); })
+                                        (int)(R))
 
-#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
   (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                         (__v16sf)_mm512_setzero_ps(), \
-                                        (__mmask16)(M), (int)(R)); })
+                                        (__mmask16)(M), (int)(R))
 
 #define _mm512_rsqrt28_ps(A) \
   _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -125,23 +125,23 @@
 #define _mm512_maskz_rsqrt28_ps(M, A) \
   _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+#define _mm_rsqrt28_round_ss(A, B, R) \
   (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)-1, (int)(R)); })
+                                              (__mmask8)-1, (int)(R))
 
-#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
   (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v4sf)(__m128)(S), \
-                                              (__mmask8)(M), (int)(R)); })
+                                              (__mmask8)(M), (int)(R))
 
-#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
   (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)(M), (int)(R)); })
+                                              (__mmask8)(M), (int)(R))
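+/* The scalar _ss/_sd forms approximate only the low element; the upper
+ * elements of the result are copied from the first source operand. */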
 
 #define _mm_rsqrt28_ss(A, B) \
   _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -152,23 +152,23 @@
 #define _mm_maskz_rsqrt28_ss(M, A, B) \
   _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+#define _mm_rsqrt28_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)-1, (int)(R)); })
+                                               (__mmask8)-1, (int)(R))
 
-#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
   (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (__v2df)(__m128d)(S), \
-                                               (__mmask8)(M), (int)(R)); })
+                                               (__mmask8)(M), (int)(R))
 
-#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
   (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)(M), (int)(R)); })
+                                               (__mmask8)(M), (int)(R))
 
 #define _mm_rsqrt28_sd(A, B) \
   _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -179,21 +179,21 @@
 #define _mm_maskz_rsqrt28_sd(M, A, B) \
   _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
-// rcp28
-#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+/* rcp28 */
+#define _mm512_rcp28_round_pd(A, R) \
   (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                        (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)-1, (int)(R)); })
+                                       (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
   (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                        (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                       (int)(R)); })
+                                       (int)(R))
 
-#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+#define _mm512_maskz_rcp28_round_pd(M, A, R) \
   (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                        (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)(M), (int)(R)); })
+                                       (__mmask8)(M), (int)(R))
 
 #define _mm512_rcp28_pd(A) \
   _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -204,20 +204,20 @@
 #define _mm512_maskz_rcp28_pd(M, A) \
   _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+#define _mm512_rcp28_round_ps(A, R) \
   (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                       (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)-1, (int)(R)); })
+                                      (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
   (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                       (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                      (int)(R)); })
+                                      (int)(R))
 
-#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+#define _mm512_maskz_rcp28_round_ps(M, A, R) \
   (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                       (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)(M), (int)(R)); })
+                                      (__mmask16)(M), (int)(R))
 
 #define _mm512_rcp28_ps(A) \
   _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -228,23 +228,23 @@
 #define _mm512_maskz_rcp28_ps(M, A) \
   _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+#define _mm_rcp28_round_ss(A, B, R) \
   (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4sf)_mm_setzero_ps(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
   (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4sf)(__m128)(S), \
-                                            (__mmask8)(M), (int)(R)); })
+                                            (__mmask8)(M), (int)(R))
 
-#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
   (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4sf)_mm_setzero_ps(), \
-                                            (__mmask8)(M), (int)(R)); })
+                                            (__mmask8)(M), (int)(R))
 
 #define _mm_rcp28_ss(A, B) \
   _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -255,23 +255,23 @@
 #define _mm_maskz_rcp28_ss(M, A, B) \
   _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+#define _mm_rcp28_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2df)_mm_setzero_pd(), \
-                                             (__mmask8)-1, (int)(R)); })
+                                             (__mmask8)-1, (int)(R))
 
-#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
   (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2df)(__m128d)(S), \
-                                             (__mmask8)(M), (int)(R)); })
+                                             (__mmask8)(M), (int)(R))
 
-#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
   (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2df)_mm_setzero_pd(), \
-                                             (__mmask8)(M), (int)(R)); })
+                                             (__mmask8)(M), (int)(R))
 
 #define _mm_rcp28_sd(A, B) \
   _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -282,4 +282,4 @@
 #define _mm_maskz_rcp28_sd(M, A, B) \
   _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
-#endif // __AVX512ERINTRIN_H
+#endif /* __AVX512ERINTRIN_H */
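
The same de-statement-expression rewrite covers the whole rcp28 family above; the rounding operand is now an ordinary macro argument. A hedged usage sketch, again assuming -mavx512er (function name illustrative):

#include <immintrin.h>

/* Reciprocal approximation with relative error < 2^-28;
 * _MM_FROUND_NO_EXC requests SAE (suppress-all-exceptions). */
__m512d recip28(__m512d d)
{
    return _mm512_rcp28_round_pd(d, _MM_FROUND_NO_EXC);
}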
diff --git a/darwin-x86/clang-headers/avx512fintrin.h b/darwin-x86/clang-headers/avx512fintrin.h
index badc436..8dd4a0a 100644
--- a/darwin-x86/clang-headers/avx512fintrin.h
+++ b/darwin-x86/clang-headers/avx512fintrin.h
@@ -54,6 +54,19 @@
 #define _MM_FROUND_TO_ZERO          0x03
 #define _MM_FROUND_CUR_DIRECTION    0x04
 
+/* Constants for integer comparison predicates */
+typedef enum {
+    _MM_CMPINT_EQ,      /* Equal */
+    _MM_CMPINT_LT,      /* Less than */
+    _MM_CMPINT_LE,      /* Less than or Equal */
+    _MM_CMPINT_UNUSED,
+    _MM_CMPINT_NE,      /* Not Equal */
+    _MM_CMPINT_NLT,     /* Not Less than */
+#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
+    _MM_CMPINT_NLE      /* Not Less than or Equal */
+#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
+} _MM_CMPINT_ENUM;
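
The new `_MM_CMPINT_ENUM` names the immediate operand of the `_mm512_cmp_ep*_mask` family; `_MM_CMPINT_GE` and `_MM_CMPINT_GT` are spelled as the complements of LT and LE. A small sketch of how the enum is consumed, assuming AVX512F (function name illustrative):

#include <immintrin.h>

/* One mask bit per 32-bit lane where a < b, signed compare. */
__mmask16 lanes_below(__m512i a, __m512i b)
{
    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
}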
+
 typedef enum
 {
   _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
@@ -160,51 +173,51 @@
 } _MM_MANTISSA_SIGN_ENUM;
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
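
The single `__DEFAULT_FN_ATTRS` is split into 512-bit and 128-bit variants carrying `__min_vector_width__`, presumably so the backend can honor options like -mprefer-vector-width=256 for ordinary code while still emitting full-width instructions when these intrinsics are inlined. A hedged illustration, assuming AVX512F:

#include <immintrin.h>

/* Even under -mprefer-vector-width=256, the inlined intrinsic body is
 * tagged with min vector width 512, so this still compiles to a
 * 512-bit add. */
__m512d sum8(__m512d a, __m512d b)
{
    return _mm512_add_pd(a, b);
}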
 
 /* Create vectors with repeated elements */
 
-static  __inline __m512i __DEFAULT_FN_ATTRS
+static  __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_setzero_si512(void)
 {
-  return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
 }
 
 #define _mm512_setzero_epi32 _mm512_setzero_si512
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_undefined_pd(void)
 {
   return (__m512d)__builtin_ia32_undef512();
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_undefined(void)
 {
   return (__m512)__builtin_ia32_undef512();
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_undefined_ps(void)
 {
   return (__m512)__builtin_ia32_undef512();
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_undefined_epi32(void)
 {
   return (__m512i)__builtin_ia32_undef512();
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_broadcastd_epi32 (__m128i __A)
 {
-  return (__m512i)__builtin_shufflevector((__v4si) __A,
-                                          (__v4si)_mm_undefined_si128(),
+  return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
 {
   return (__m512i)__builtin_ia32_selectd_512(__M,
@@ -212,7 +225,7 @@
                                              (__v16si) __O);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
 {
   return (__m512i)__builtin_ia32_selectd_512(__M,
@@ -220,15 +233,14 @@
                                              (__v16si) _mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_broadcastq_epi64 (__m128i __A)
 {
-  return (__m512i)__builtin_shufflevector((__v2di) __A,
-                                          (__v2di) _mm_undefined_si128(),
+  return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
 {
   return (__m512i)__builtin_ia32_selectq_512(__M,
@@ -237,7 +249,7 @@
 
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
 {
   return (__m512i)__builtin_ia32_selectq_512(__M,
@@ -245,129 +257,123 @@
                                              (__v8di) _mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
-{
-  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
-                 (__v16si)
-                 _mm512_setzero_si512 (),
-                 __M);
-}
 
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
-{
-#ifdef __x86_64__
-  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
-                 (__v8di)
-                 _mm512_setzero_si512 (),
-                 __M);
-#else
-  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
-                 (__v8di)
-                 _mm512_setzero_si512 (),
-                 __M);
-#endif
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_setzero_ps(void)
 {
-  return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+  return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+                                 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
 }
 
 #define _mm512_setzero _mm512_setzero_ps
 
-static  __inline __m512d __DEFAULT_FN_ATTRS
+static  __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_setzero_pd(void)
 {
-  return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+  return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_set1_ps(float __w)
 {
-  return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
-                   __w, __w, __w, __w, __w, __w, __w, __w  };
+  return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
+                                 __w, __w, __w, __w, __w, __w, __w, __w  };
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_set1_pd(double __w)
 {
-  return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
+  return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_set1_epi8(char __w)
 {
-  return (__m512i)(__v64qi){ __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w  };
+  return __extension__ (__m512i)(__v64qi){
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w  };
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_set1_epi16(short __w)
 {
-  return (__m512i)(__v32hi){ __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w,
-                             __w, __w, __w, __w, __w, __w, __w, __w };
+  return __extension__ (__m512i)(__v32hi){
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_set1_epi32(int __s)
 {
-  return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
-                             __s, __s, __s, __s, __s, __s, __s, __s };
+  return __extension__ (__m512i)(__v16si){
+    __s, __s, __s, __s, __s, __s, __s, __s,
+    __s, __s, __s, __s, __s, __s, __s, __s };
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__M,
+                                             (__v16si)_mm512_set1_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_set1_epi64(long long __d)
 {
-  return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
+  return __extension__ (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                             (__v8di)_mm512_set1_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
+}
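
Note the maskz_set1 pair moved below the unmasked set1 definitions it now reuses: broadcast the scalar, then select against zero. A behavior sketch, assuming AVX512F (function name illustrative):

#include <immintrin.h>

/* Lanes 0..7 become 7; lanes 8..15, masked off, become 0. */
__m512i seven_in_low_lanes(void)
{
    return _mm512_maskz_set1_epi32((__mmask16)0x00FF, 7);
}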
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_broadcastss_ps(__m128 __A)
 {
-  return (__m512)__builtin_shufflevector((__v4sf) __A,
-                                         (__v4sf)_mm_undefined_ps(),
+  return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_set4_epi32 (int __A, int __B, int __C, int __D)
 {
-  return  (__m512i)(__v16si)
+  return __extension__ (__m512i)(__v16si)
    { __D, __C, __B, __A, __D, __C, __B, __A,
      __D, __C, __B, __A, __D, __C, __B, __A };
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_set4_epi64 (long long __A, long long __B, long long __C,
        long long __D)
 {
-  return  (__m512i) (__v8di)
+  return __extension__ (__m512i) (__v8di)
    { __D, __C, __B, __A, __D, __C, __B, __A };
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_set4_pd (double __A, double __B, double __C, double __D)
 {
-  return  (__m512d)
+  return __extension__ (__m512d)
    { __D, __C, __B, __A, __D, __C, __B, __A };
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_set4_ps (float __A, float __B, float __C, float __D)
 {
-  return  (__m512)
+  return __extension__ (__m512)
    { __D, __C, __B, __A, __D, __C, __B, __A,
      __D, __C, __B, __A, __D, __C, __B, __A };
 }
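
The set4 functions repeat a four-element pattern across the vector; the setr4 aliases in the next hunk reverse the arguments into memory order. Illustrative sketch, assuming AVX512F:

#include <immintrin.h>

/* The pattern 0,1,2,3 repeated four times, in memory order. */
__m512 ramp16(void)
{
    return _mm512_setr4_ps(0.0f, 1.0f, 2.0f, 3.0f);
}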
@@ -384,133 +390,254 @@
 #define _mm512_setr4_ps(e0,e1,e2,e3)                \
   _mm512_set4_ps((e3),(e2),(e1),(e0))
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_broadcastsd_pd(__m128d __A)
 {
-  return (__m512d)__builtin_shufflevector((__v2df) __A,
-                                          (__v2df) _mm_undefined_pd(),
+  return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /* Cast between vector types */
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_castpd256_pd512(__m256d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_castps256_ps512(__m256 __a)
 {
   return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
                                           -1, -1, -1, -1, -1, -1, -1, -1);
 }
 
-static __inline __m128d __DEFAULT_FN_ATTRS
+static __inline __m128d __DEFAULT_FN_ATTRS512
 _mm512_castpd512_pd128(__m512d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1);
 }
 
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS512
 _mm512_castpd512_pd256 (__m512d __A)
 {
   return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
 }
 
-static __inline __m128 __DEFAULT_FN_ATTRS
+static __inline __m128 __DEFAULT_FN_ATTRS512
 _mm512_castps512_ps128(__m512 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
 }
 
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS512
 _mm512_castps512_ps256 (__m512 __A)
 {
   return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_castpd_ps (__m512d __A)
 {
   return (__m512) (__A);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_castpd_si512 (__m512d __A)
 {
   return (__m512i) (__A);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_castpd128_pd512 (__m128d __A)
 {
   return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_castps_pd (__m512 __A)
 {
   return (__m512d) (__A);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_castps_si512 (__m512 __A)
 {
   return (__m512i) (__A);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_castps128_ps512 (__m128 __A)
 {
     return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_castsi128_si512 (__m128i __A)
 {
    return  __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_castsi256_si512 (__m256i __A)
 {
    return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_castsi512_ps (__m512i __A)
 {
   return (__m512) (__A);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_castsi512_pd (__m512i __A)
 {
   return (__m512d) (__A);
 }
 
-static __inline __m128i __DEFAULT_FN_ATTRS
+static __inline __m128i __DEFAULT_FN_ATTRS512
 _mm512_castsi512_si128 (__m512i __A)
 {
   return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
 }
 
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS512
 _mm512_castsi512_si256 (__m512i __A)
 {
   return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
 }
 
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_int2mask(int __a)
+{
+  return (__mmask16)__a;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask2int(__mmask16 __a)
+{
+  return (int)__a;
+}
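
The new int2mask/mask2int helpers are thin casts between `int` and `__mmask16`. A sketch of the round-trip behavior, assuming AVX512F:

#include <immintrin.h>

/* Round-tripping keeps the low 16 mask bits: result == bits & 0xFFFF. */
int mask_roundtrip(int bits)
{
    return _mm512_mask2int(_mm512_int2mask(bits));
}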
+
+/// Constructs a 512-bit floating-point vector of [8 x double] from a
+///    128-bit floating-point vector of [2 x double]. The lower 128 bits
+///    contain the value of the source vector. The upper 384 bits are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
+///    contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_zextpd128_pd512(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
+}
+
+/// Constructs a 512-bit floating-point vector of [8 x double] from a
+///    256-bit floating-point vector of [4 x double]. The lower 256 bits
+///    contain the value of the source vector. The upper 256 bits are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
+///    contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_zextpd256_pd512(__m256d __a)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// Constructs a 512-bit floating-point vector of [16 x float] from a
+///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+///    the value of the source vector. The upper 384 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
+///    contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_zextps128_ps512(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
+}
+
+/// Constructs a 512-bit floating-point vector of [16 x float] from a
+///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
+///    the value of the source vector. The upper 256 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
+///    contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_zextps256_ps512(__m256 __a)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// Constructs a 512-bit integer vector from a 128-bit integer vector.
+///    The lower 128 bits contain the value of the source vector. The upper
+///    384 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
+///    the parameter. The upper 384 bits are set to zero.
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_zextsi128_si512(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
+}
+
+/// Constructs a 512-bit integer vector from a 256-bit integer vector.
+///    The lower 256 bits contain the value of the source vector. The upper
+///    256 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
+///    the parameter. The upper 256 bits are set to zero.
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_zextsi256_si512(__m256i __a)
+{
+  return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
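
Unlike the plain `_mm512_cast*` forms, whose upper lanes are undefined, the new zext casts guarantee zeroed upper lanes, as the doc comments above state. A usage sketch, assuming AVX512F (function name illustrative):

#include <immintrin.h>

/* The lower four floats come from `lo`; lanes 4..15 are guaranteed
 * to be 0.0f, not undefined. */
__m512 widen_ps(__m128 lo)
{
    return _mm512_zextps128_ps512(lo);
}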
+
 /* Bitwise operators */
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_and_epi32(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v16su)__a & (__v16su)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
@@ -518,20 +645,20 @@
                 (__v16si) __src);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
                                          __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_and_epi64(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a & (__v8du)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
 {
     return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
@@ -539,26 +666,26 @@
                 (__v8di) __src);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
                                          __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_andnot_si512 (__m512i __A, __m512i __B)
 {
-  return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
+  return (__m512i)(~(__v8du)__A & (__v8du)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_andnot_epi32 (__m512i __A, __m512i __B)
 {
-  return (__m512i)(~(__v16su)(__A) & (__v16su)__B);
+  return (__m512i)(~(__v16su)__A & (__v16su)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
@@ -566,20 +693,20 @@
                                          (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
                                            __U, __A, __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_andnot_epi64(__m512i __A, __m512i __B)
 {
-  return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
+  return (__m512i)(~(__v8du)__A & (__v8du)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
@@ -587,20 +714,20 @@
                                           (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
                                            __U, __A, __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_or_epi32(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v16su)__a | (__v16su)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
@@ -608,19 +735,19 @@
                                              (__v16si)__src);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_or_epi64(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a | (__v8du)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
@@ -628,19 +755,19 @@
                                              (__v8di)__src);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_xor_epi32(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v16su)__a ^ (__v16su)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
@@ -648,19 +775,19 @@
                                             (__v16si)__src);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_xor_epi64(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a ^ (__v8du)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
@@ -668,25 +795,25 @@
                                              (__v8di)__src);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_and_si512(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a & (__v8du)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_or_si512(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a | (__v8du)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_xor_si512(__m512i __a, __m512i __b)
 {
   return (__m512i)((__v8du)__a ^ (__v8du)__b);
@@ -694,243 +821,205 @@
 
 /* Arithmetic */
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_add_pd(__m512d __a, __m512d __b)
 {
   return (__m512d)((__v8df)__a + (__v8df)__b);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_add_ps(__m512 __a, __m512 __b)
 {
   return (__m512)((__v16sf)__a + (__v16sf)__b);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_mul_pd(__m512d __a, __m512d __b)
 {
   return (__m512d)((__v8df)__a * (__v8df)__b);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_mul_ps(__m512 __a, __m512 __b)
 {
   return (__m512)((__v16sf)__a * (__v16sf)__b);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_sub_pd(__m512d __a, __m512d __b)
 {
   return (__m512d)((__v8df)__a - (__v8df)__b);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_sub_ps(__m512 __a, __m512 __b)
 {
   return (__m512)((__v16sf)__a - (__v16sf)__b);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_add_epi64 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v8du) __A + (__v8du) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
-             (__v8di) __B,
-             (__v8di) __W,
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_add_epi64(__A, __B),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
-             (__v8di) __B,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_add_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_sub_epi64 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v8du) __A - (__v8du) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
-             (__v8di) __B,
-             (__v8di) __W,
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_sub_epi64(__A, __B),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
-             (__v8di) __B,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_sub_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_add_epi32 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v16su) __A + (__v16su) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
-             (__v16si) __B,
-             (__v16si) __W,
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_add_epi32(__A, __B),
+                                             (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
-             (__v16si) __B,
-             (__v16si)
-             _mm512_setzero_si512 (),
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_add_epi32(__A, __B),
+                                             (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_sub_epi32 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v16su) __A - (__v16su) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
-             (__v16si) __B,
-             (__v16si) __W,
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_sub_epi32(__A, __B),
+                                             (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
-             (__v16si) __B,
-             (__v16si)
-             _mm512_setzero_si512 (),
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_sub_epi32(__A, __B),
+                                             (__v16si)_mm512_setzero_si512());
 }
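
This hunk replaces the dedicated masked add/sub builtins (`paddd512_mask` and friends) with a generic per-lane select over the unmasked operation, exposing an ordinary add or sub plus a select to the optimizer. The semantics are unchanged, as in this sketch (assumes AVX512F; names illustrative):

#include <immintrin.h>

/* Lanes 0..7 get a+b; lanes 8..15 are passed through from w. */
__m512i add_low_half(__m512i w, __m512i a, __m512i b)
{
    return _mm512_mask_add_epi32(w, (__mmask16)0x00FF, a, b);
}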
 
-#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                        (int)(R)); })
+#define _mm512_max_round_pd(A, B, R) \
+  (__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
+                                   (__v8df)(__m512d)(B), (int)(R))
 
-#define _mm512_maskz_max_round_pd(U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)(U), (int)(R)); })
+#define _mm512_mask_max_round_pd(W, U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_max_round_pd((A), (B), (R)), \
+                                   (__v8df)(W))
 
-#define _mm512_max_round_pd(A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_undefined_pd(), \
-                                        (__mmask8)-1, (int)(R)); })
+#define _mm512_maskz_max_round_pd(U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_max_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd())
 
-static  __inline__ __m512d __DEFAULT_FN_ATTRS
+static  __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_max_pd(__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df)
-             _mm512_setzero_pd (),
-             (__mmask8) -1,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
+                                           _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
-                  (__v8df) __B,
-                  (__v8df) __W,
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512(__U,
+                                              (__v8df)_mm512_max_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
-                  (__v8df) __B,
-                  (__v8df)
-                  _mm512_setzero_pd (),
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512(__U,
+                                              (__v8df)_mm512_max_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-#define _mm512_mask_max_round_ps(W, U, A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                       (int)(R)); })
+#define _mm512_max_round_ps(A, B, R) \
+  (__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
+                                  (__v16sf)(__m512)(B), (int)(R))
 
-#define _mm512_maskz_max_round_ps(U, A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)(U), (int)(R)); })
+#define _mm512_mask_max_round_ps(W, U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
+                                  (__v16sf)(W))
 
-#define _mm512_max_round_ps(A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_undefined_ps(), \
-                                       (__mmask16)-1, (int)(R)); })
+#define _mm512_maskz_max_round_ps(U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps())
 
-static  __inline__ __m512 __DEFAULT_FN_ATTRS
+static  __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_max_ps(__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf)
-            _mm512_setzero_ps (),
-            (__mmask16) -1,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
+                                          _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
-                 (__v16sf) __B,
-                 (__v16sf) __W,
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512(__U,
+                                             (__v16sf)_mm512_max_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
-                 (__v16sf) __B,
-                 (__v16sf)
-                 _mm512_setzero_ps (),
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512(__U,
+                                             (__v16sf)_mm512_max_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
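
The floating-point max family follows the same pattern: an unmasked builtin plus select-based mask/maskz wrappers, with the rounding/SAE operand kept as a plain macro argument. Sketch, assuming AVX512F:

#include <immintrin.h>

/* Lane-wise maximum with floating-point exceptions suppressed (SAE). */
__m512 quiet_max(__m512 a, __m512 b)
{
    return _mm512_max_round_ps(a, b, _MM_FROUND_NO_EXC);
}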
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
@@ -939,7 +1028,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
@@ -948,25 +1037,25 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_max_round_ss(A, B, R) __extension__ ({ \
+#define _mm_max_round_ss(A, B, R) \
   (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm_mask_max_round_ss(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_max_round_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm_maskz_max_round_ss(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_max_round_ss(U, A, B, R) \
   (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
@@ -975,7 +1064,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
@@ -984,238 +1073,188 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_max_round_sd(A, B, R) __extension__ ({ \
+#define _mm_max_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm_mask_max_round_sd(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_max_round_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_max_round_sd(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_max_round_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
 static __inline __m512i
-__DEFAULT_FN_ATTRS
+__DEFAULT_FN_ATTRS512
 _mm512_max_epi32(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_pmaxsd512((__v16si)__A, (__v16si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
-                   (__v16si) __B,
-                   (__v16si) __W, __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                            (__v16si)_mm512_max_epi32(__A, __B),
+                                            (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
-                   (__v16si) __B,
-                   (__v16si)
-                   _mm512_setzero_si512 (),
-                   __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                            (__v16si)_mm512_max_epi32(__A, __B),
+                                            (__v16si)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_max_epu32(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_pmaxud512((__v16si)__A, (__v16si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
-                   (__v16si) __B,
-                   (__v16si) __W, __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                            (__v16si)_mm512_max_epu32(__A, __B),
+                                            (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
-                   (__v16si) __B,
-                   (__v16si)
-                   _mm512_setzero_si512 (),
-                   __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                            (__v16si)_mm512_max_epu32(__A, __B),
+                                            (__v16si)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_max_epi64(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) -1);
+  return (__m512i)__builtin_ia32_pmaxsq512((__v8di)__A, (__v8di)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
-                   (__v8di) __B,
-                   (__v8di) __W, __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_max_epi64(__A, __B),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
-                   (__v8di) __B,
-                   (__v8di)
-                   _mm512_setzero_si512 (),
-                   __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_max_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_max_epu64(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) -1);
+  return (__m512i)__builtin_ia32_pmaxuq512((__v8di)__A, (__v8di)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
-                   (__v8di) __B,
-                   (__v8di) __W, __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_max_epu64(__A, __B),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
-                   (__v8di) __B,
-                   (__v8di)
-                   _mm512_setzero_si512 (),
-                   __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_max_epu64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-#define _mm512_mask_min_round_pd(W, U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                        (int)(R)); })
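+// The *_round_* macros are now layered: an unmasked builtin carries the
+// rounding control, and the masked forms wrap it in an element select.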
+#define _mm512_min_round_pd(A, B, R) \
+  (__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
+                                   (__v8df)(__m512d)(B), (int)(R))
 
-#define _mm512_maskz_min_round_pd(U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)(U), (int)(R)); })
+#define _mm512_mask_min_round_pd(W, U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_min_round_pd((A), (B), (R)), \
+                                   (__v8df)(W))
 
-#define _mm512_min_round_pd(A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_undefined_pd(), \
-                                        (__mmask8)-1, (int)(R)); })
+#define _mm512_maskz_min_round_pd(U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_min_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd())
 
-static  __inline__ __m512d __DEFAULT_FN_ATTRS
+static  __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_min_pd(__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df)
-             _mm512_setzero_pd (),
-             (__mmask8) -1,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
+                                           _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
-                  (__v8df) __B,
-                  (__v8df) __W,
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512(__U,
+                                              (__v8df)_mm512_min_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
-#define _mm512_mask_min_round_ps(W, U, A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                       (int)(R)); })
-
-#define _mm512_maskz_min_round_ps(U, A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)(U), (int)(R)); })
-
-#define _mm512_min_round_ps(A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_undefined_ps(), \
-                                       (__mmask16)-1, (int)(R)); })
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
-                  (__v8df) __B,
-                  (__v8df)
-                  _mm512_setzero_pd (),
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512(__U,
+                                              (__v8df)_mm512_min_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-static  __inline__ __m512 __DEFAULT_FN_ATTRS
+#define _mm512_min_round_ps(A, B, R) \
+  (__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
+                                  (__v16sf)(__m512)(B), (int)(R))
+
+#define _mm512_mask_min_round_ps(W, U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
+                                  (__v16sf)(W))
+
+#define _mm512_maskz_min_round_ps(U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps())
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_min_ps(__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf)
-            _mm512_setzero_ps (),
-            (__mmask16) -1,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
+                                          _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
-                 (__v16sf) __B,
-                 (__v16sf) __W,
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512(__U,
+                                             (__v16sf)_mm512_min_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
-                 (__v16sf) __B,
-                 (__v16sf)
-                 _mm512_setzero_ps (),
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512(__U,
+                                             (__v16sf)_mm512_min_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
@@ -1224,7 +1263,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
                 (__v4sf) __B,
@@ -1233,25 +1272,25 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_min_round_ss(A, B, R) __extension__ ({ \
+#define _mm_min_round_ss(A, B, R) \
   (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm_mask_min_round_ss(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_min_round_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm_maskz_min_round_ss(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_min_round_ss(U, A, B, R) \
   (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
@@ -1260,7 +1299,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
                 (__v2df) __B,
@@ -1269,303 +1308,264 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_min_round_sd(A, B, R) __extension__ ({ \
+#define _mm_min_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm_mask_min_round_sd(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_min_round_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_min_round_sd(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_min_round_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
 static __inline __m512i
-__DEFAULT_FN_ATTRS
+__DEFAULT_FN_ATTRS512
 _mm512_min_epi32(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_pminsd512((__v16si)__A, (__v16si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
-                   (__v16si) __B,
-                   (__v16si) __W, __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                            (__v16si)_mm512_min_epi32(__A, __B),
+                                            (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
-                   (__v16si) __B,
-                   (__v16si)
-                   _mm512_setzero_si512 (),
-                   __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                            (__v16si)_mm512_min_epi32(__A, __B),
+                                            (__v16si)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_min_epu32(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_pminud512((__v16si)__A, (__v16si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
-                   (__v16si) __B,
-                   (__v16si) __W, __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                            (__v16si)_mm512_min_epu32(__A, __B),
+                                            (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
-                   (__v16si) __B,
-                   (__v16si)
-                   _mm512_setzero_si512 (),
-                   __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                            (__v16si)_mm512_min_epu32(__A, __B),
+                                            (__v16si)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_min_epi64(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) -1);
+  return (__m512i)__builtin_ia32_pminsq512((__v8di)__A, (__v8di)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
-                   (__v8di) __B,
-                   (__v8di) __W, __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_min_epi64(__A, __B),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
-                   (__v8di) __B,
-                   (__v8di)
-                   _mm512_setzero_si512 (),
-                   __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_min_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_min_epu64(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) -1);
+  return (__m512i)__builtin_ia32_pminuq512((__v8di)__A, (__v8di)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
-                   (__v8di) __B,
-                   (__v8di) __W, __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_min_epu64(__A, __B),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
-                   (__v8di) __B,
-                   (__v8di)
-                   _mm512_setzero_si512 (),
-                   __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_min_epu64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mul_epi32(__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) -1);
+  return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si)__Y);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v8di) __W, __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epi32(__X, __Y),
+                                             (__v8di)__W);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epi32(__X, __Y),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mul_epu32(__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
-               (__v16si) __Y,
-               (__v8di)
-               _mm512_setzero_si512 (),
-               (__mmask8) -1);
+  return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
-               (__v16si) __Y,
-               (__v8di) __W, __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epu32(__X, __Y),
+                                             (__v8di)__W);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
-               (__v16si) __Y,
-               (__v8di)
-               _mm512_setzero_si512 (),
-               __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epu32(__X, __Y),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mullo_epi32 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v16su) __A * (__v16su) __B);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                             (__v16si)_mm512_mullo_epi32(__A, __B),
+                                             (__v16si)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si) __W, __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                             (__v16si)_mm512_mullo_epi32(__A, __B),
+                                             (__v16si)__W);
 }
 
-#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
-                                         (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                         (int)(R)); })
-
-#define _mm512_maskz_sqrt_round_pd(U, A, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
-                                         (__v8df)_mm512_setzero_pd(), \
-                                         (__mmask8)(U), (int)(R)); })
-
-#define _mm512_sqrt_round_pd(A, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
-                                         (__v8df)_mm512_undefined_pd(), \
-                                         (__mmask8)-1, (int)(R)); })
-
-static  __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_sqrt_pd(__m512d __a)
-{
-  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__a,
-                                                (__v8df) _mm512_setzero_pd (),
-                                                (__mmask8) -1,
-                                                _MM_FROUND_CUR_DIRECTION);
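+// A full 64-bit integer multiply needs AVX512DQ (vpmullq); under AVX512F the
+// operation is expressed as a plain vector multiply for the backend to lower.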
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mullox_epi64 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v8du) __A * (__v8du) __B);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_mullox_epi64(__A, __B),
+                                             (__v8di)__W);
+}
+
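+// Rounding sqrt follows the same layering: the builtin carries the rounding
+// mode, and the masked variants select against the pass-through or zero
+// vector.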
+#define _mm512_sqrt_round_pd(A, R) \
+  (__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R))
+
+#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_sqrt_round_pd((A), (R)), \
+                                       (__v8df)(__m512d)(W))
+
+#define _mm512_maskz_sqrt_round_pd(U, A, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_sqrt_round_pd((A), (R)), \
+                                       (__v8df)_mm512_setzero_pd())
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_sqrt_pd(__m512d __A)
+{
+  return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
-                   (__v8df) __W,
-                   (__mmask8) __U,
-                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512(__U,
+                                              (__v8df)_mm512_sqrt_pd(__A),
+                                              (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
 {
-  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
-                   (__v8df)
-                   _mm512_setzero_pd (),
-                   (__mmask8) __U,
-                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512(__U,
+                                              (__v8df)_mm512_sqrt_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-#define _mm512_mask_sqrt_round_ps(W, U, A, R) __extension__ ({ \
-  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
-                                        (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                        (int)(R)); })
+#define _mm512_sqrt_round_ps(A, R) \
+  (__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R))
 
-#define _mm512_maskz_sqrt_round_ps(U, A, R) __extension__ ({ \
-  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
-                                        (__v16sf)_mm512_setzero_ps(), \
-                                        (__mmask16)(U), (int)(R)); })
+#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
+                                      (__v16sf)(__m512)(W))
 
-#define _mm512_sqrt_round_ps(A, R) __extension__ ({ \
-  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
-                                        (__v16sf)_mm512_undefined_ps(), \
-                                        (__mmask16)-1, (int)(R)); })
+#define _mm512_maskz_sqrt_round_ps(U, A, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
+                                      (__v16sf)_mm512_setzero_ps())
 
-static  __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_sqrt_ps(__m512 __a)
+static  __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_sqrt_ps(__m512 __A)
 {
-  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__a,
-                                               (__v16sf) _mm512_setzero_ps (),
-                                               (__mmask16) -1,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
+                                          _MM_FROUND_CUR_DIRECTION);
 }
 
-static  __inline__ __m512 __DEFAULT_FN_ATTRS
+static  __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
 {
-  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A,
-                                               (__v16sf) __W,
-                                               (__mmask16) __U,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512(__U,
+                                             (__v16sf)_mm512_sqrt_ps(__A),
+                                             (__v16sf)__W);
 }
 
-static  __inline__ __m512 __DEFAULT_FN_ATTRS
+static  __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
 {
-  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A,
-                                               (__v16sf) _mm512_setzero_ps (),
-                                               (__mmask16) __U,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512(__U,
+                                             (__v16sf)_mm512_sqrt_ps(__A),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
-static  __inline__ __m512d __DEFAULT_FN_ATTRS
+static  __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_rsqrt14_pd(__m512d __A)
 {
   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
@@ -1573,7 +1573,7 @@
                  _mm512_setzero_pd (),
                  (__mmask8) -1);}
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
@@ -1581,7 +1581,7 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
@@ -1590,7 +1590,7 @@
                   (__mmask8) __U);
 }
 
-static  __inline__ __m512 __DEFAULT_FN_ATTRS
+static  __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_rsqrt14_ps(__m512 __A)
 {
   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
@@ -1599,7 +1599,7 @@
                 (__mmask16) -1);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
@@ -1607,7 +1607,7 @@
                  (__mmask16) __U);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
@@ -1616,7 +1616,7 @@
                  (__mmask16) __U);
 }
 
-static  __inline__ __m128 __DEFAULT_FN_ATTRS
+static  __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_rsqrt14_ss(__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
@@ -1626,7 +1626,7 @@
              (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
@@ -1635,7 +1635,7 @@
           (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
@@ -1644,7 +1644,7 @@
           (__mmask8) __U);
 }
 
-static  __inline__ __m128d __DEFAULT_FN_ATTRS
+static  __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_rsqrt14_sd(__m128d __A, __m128d __B)
 {
   return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
@@ -1654,7 +1654,7 @@
               (__mmask8) -1);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
@@ -1663,7 +1663,7 @@
           (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
@@ -1672,7 +1672,7 @@
           (__mmask8) __U);
 }
 
-static  __inline__ __m512d __DEFAULT_FN_ATTRS
+static  __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_rcp14_pd(__m512d __A)
 {
   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
@@ -1681,7 +1681,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
@@ -1689,7 +1689,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
@@ -1698,7 +1698,7 @@
                 (__mmask8) __U);
 }
 
-static  __inline__ __m512 __DEFAULT_FN_ATTRS
+static  __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_rcp14_ps(__m512 __A)
 {
   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
@@ -1707,7 +1707,7 @@
               (__mmask16) -1);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
@@ -1715,7 +1715,7 @@
                    (__mmask16) __U);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
@@ -1724,7 +1724,7 @@
                    (__mmask16) __U);
 }
 
-static  __inline__ __m128 __DEFAULT_FN_ATTRS
+static  __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_rcp14_ss(__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
@@ -1734,7 +1734,7 @@
                  (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
@@ -1743,7 +1743,7 @@
           (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
@@ -1752,7 +1752,7 @@
           (__mmask8) __U);
 }
 
-static  __inline__ __m128d __DEFAULT_FN_ATTRS
+static  __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_rcp14_sd(__m128d __A, __m128d __B)
 {
   return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
@@ -1762,7 +1762,7 @@
             (__mmask8) -1);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
@@ -1771,7 +1771,7 @@
           (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
@@ -1780,7 +1780,7 @@
           (__mmask8) __U);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_floor_ps(__m512 __A)
 {
   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
@@ -1789,7 +1789,7 @@
                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
@@ -1798,7 +1798,7 @@
                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_floor_pd(__m512d __A)
 {
   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
@@ -1807,7 +1807,7 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
@@ -1816,7 +1816,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
@@ -1825,7 +1825,7 @@
                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_ceil_ps(__m512 __A)
 {
   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
@@ -1834,7 +1834,7 @@
                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_ceil_pd(__m512d __A)
 {
   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
@@ -1843,7 +1843,7 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
@@ -1852,796 +1852,672 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_abs_epi64(__m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) -1);
+  return (__m512i)__builtin_ia32_pabsq512((__v8di)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
-                  (__v8di) __W,
-                  (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_abs_epi64(__A),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
-                  (__v8di)
-                  _mm512_setzero_si512 (),
-                  (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_abs_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_abs_epi32(__m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
-             (__v16si)
-             _mm512_setzero_si512 (),
-             (__mmask16) -1);
+  return (__m512i)__builtin_ia32_pabsd512((__v16si)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
-                  (__v16si) __W,
-                  (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                             (__v16si)_mm512_abs_epi32(__A),
+                                             (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
-                  (__v16si)
-                  _mm512_setzero_si512 (),
-                  (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                             (__v16si)_mm512_abs_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
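+  // Add unmasked, then use __U to choose element 0 from the sum or from __W;
+  // the upper elements are taken from __A.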
+  __A = _mm_add_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, __W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf)  _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_add_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
 }
 
-#define _mm_add_round_ss(A, B, R) __extension__ ({ \
+#define _mm_add_round_ss(A, B, R) \
   (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm_mask_add_round_ss(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_add_round_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm_maskz_add_round_ss(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_add_round_ss(U, A, B, R) \
   (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_add_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, __W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)  _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_add_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
 }
-#define _mm_add_round_sd(A, B, R) __extension__ ({ \
+#define _mm_add_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm_mask_add_round_sd(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_add_round_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_add_round_sd(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_add_round_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
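+// Masked packed FP add/sub defer to the unmasked operation and blend the
+// result with an element select, as with the integer min/max above.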
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df) __W,
-             (__mmask8) __U,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_add_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df) _mm512_setzero_pd (),
-             (__mmask8) __U,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_add_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf) __W,
-            (__mmask16) __U,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_add_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf) _mm512_setzero_ps (),
-            (__mmask16) __U,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_add_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
-#define _mm512_add_round_pd(A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)-1, (int)(R)); })
+#define _mm512_add_round_pd(A, B, R) \
+  (__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
+                                   (__v8df)(__m512d)(B), (int)(R))
 
-#define _mm512_mask_add_round_pd(W, U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                        (int)(R)); })
+#define _mm512_mask_add_round_pd(W, U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_add_round_pd((A), (B), (R)), \
+                                   (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_add_round_pd(U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)(U), (int)(R)); })
+#define _mm512_maskz_add_round_pd(U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_add_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd())
 
-#define _mm512_add_round_ps(A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)-1, (int)(R)); })
+#define _mm512_add_round_ps(A, B, R) \
+  (__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
+                                  (__v16sf)(__m512)(B), (int)(R))
 
-#define _mm512_mask_add_round_ps(W, U, A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                       (int)(R)); })
+#define _mm512_mask_add_round_ps(W, U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
+                                  (__v16sf)(__m512)(W))
 
-#define _mm512_maskz_add_round_ps(U, A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)(U), (int)(R)); })
+#define _mm512_maskz_add_round_ps(U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps())
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_sub_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, __W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf)  _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_sub_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
 }
-#define _mm_sub_round_ss(A, B, R) __extension__ ({ \
+#define _mm_sub_round_ss(A, B, R) \
   (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm_mask_sub_round_ss(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_sub_round_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm_maskz_sub_round_ss(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_sub_round_ss(U, A, B, R) \
   (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_sub_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, __W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)  _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_sub_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
 }
 
-#define _mm_sub_round_sd(A, B, R) __extension__ ({ \
+#define _mm_sub_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm_mask_sub_round_sd(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_sub_round_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_sub_round_sd(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_sub_round_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df) __W,
-             (__mmask8) __U,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_sub_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df)
-             _mm512_setzero_pd (),
-             (__mmask8) __U,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_sub_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf) __W,
-            (__mmask16) __U,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_sub_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf)
-            _mm512_setzero_ps (),
-            (__mmask16) __U,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_sub_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
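
The 512-bit masked forms follow the same recipe: unmasked arithmetic fed into
a per-element select. The rewrite should be observationally equivalent to an
explicit blend, as in this sketch:

#include <immintrin.h>
#include <assert.h>

int main(void) {
  __m512d a = _mm512_set1_pd(10.0);
  __m512d b = _mm512_set1_pd(4.0);
  __m512d w = _mm512_set1_pd(-1.0);
  __mmask8 u = 0xA5;
  /* select(u, a - b, w): exactly what the new function body expands to. */
  __m512d masked = _mm512_mask_sub_pd(w, u, a, b);
  __m512d manual = _mm512_mask_blend_pd(u, w, _mm512_sub_pd(a, b));
  assert(_mm512_cmpeq_pd_mask(masked, manual) == 0xFF);
  return 0;
}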
 
-#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)-1, (int)(R)); })
+#define _mm512_sub_round_pd(A, B, R) \
+  (__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
+                                   (__v8df)(__m512d)(B), (int)(R))
 
-#define _mm512_mask_sub_round_pd(W, U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                        (int)(R)); })
+#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
+                                   (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_sub_round_pd(U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)(U), (int)(R)); })
+#define _mm512_maskz_sub_round_pd(U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd())
 
-#define _mm512_sub_round_ps(A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)-1, (int)(R)); })
+#define _mm512_sub_round_ps(A, B, R) \
+  (__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
+                                  (__v16sf)(__m512)(B), (int)(R))
 
-#define _mm512_mask_sub_round_ps(W, U, A, B, R)  __extension__ ({ \
-  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                       (int)(R)); });
+#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
+                                  (__v16sf)(__m512)(W))
 
-#define _mm512_maskz_sub_round_ps(U, A, B, R)  __extension__ ({ \
-  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)(U), (int)(R)); });
+#define _mm512_maskz_sub_round_ps(U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps())
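
The rounding macros are now plain expression macros rather than GNU statement
expressions, the unmasked forms call a leaner builtin that takes only the
rounding mode, and the mask/maskz forms wrap the unmasked macro in a select.
Usage is unchanged; a sketch:

#include <immintrin.h>

__m512d sub_toward_zero(__m512d a, __m512d b) {
  return _mm512_sub_round_pd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

__m512d masked_sub_up(__m512d w, __mmask8 u, __m512d a, __m512d b) {
  /* Expands to selectpd_512(u, _mm512_sub_round_pd(a, b, R), w). */
  return _mm512_mask_sub_round_pd(w, u, a, b,
                                  _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}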
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_mul_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, __W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf)  _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_mul_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
 }
-#define _mm_mul_round_ss(A, B, R) __extension__ ({ \
+#define _mm_mul_round_ss(A, B, R) \
   (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm_mask_mul_round_ss(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_mul_round_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm_maskz_mul_round_ss(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_mul_round_ss(U, A, B, R) \
   (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_mul_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, __W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)  _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_mul_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
 }
 
-#define _mm_mul_round_sd(A, B, R) __extension__ ({ \
+#define _mm_mul_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm_mask_mul_round_sd(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_mul_round_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_mul_round_sd(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_mul_round_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df) __W,
-             (__mmask8) __U,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_mul_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df)
-             _mm512_setzero_pd (),
-             (__mmask8) __U,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_mul_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf) __W,
-            (__mmask16) __U,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_mul_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf)
-            _mm512_setzero_ps (),
-            (__mmask16) __U,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_mul_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
-#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)-1, (int)(R)); })
+#define _mm512_mul_round_pd(A, B, R) \
+  (__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
+                                   (__v8df)(__m512d)(B), (int)(R))
 
-#define _mm512_mask_mul_round_pd(W, U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                        (int)(R)); })
+#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
+                                   (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_mul_round_pd(U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)(U), (int)(R)); })
+#define _mm512_maskz_mul_round_pd(U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd())
 
-#define _mm512_mul_round_ps(A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)-1, (int)(R)); })
+#define _mm512_mul_round_ps(A, B, R) \
+  (__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
+                                  (__v16sf)(__m512)(B), (int)(R))
 
-#define _mm512_mask_mul_round_ps(W, U, A, B, R)  __extension__ ({ \
-  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                       (int)(R)); });
+#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
+                                  (__v16sf)(__m512)(W))
 
-#define _mm512_maskz_mul_round_ps(U, A, B, R)  __extension__ ({ \
-  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)(U), (int)(R)); });
+#define _mm512_maskz_mul_round_ps(U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps())
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_div_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, __W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf)  _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_div_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
 }
 
-#define _mm_div_round_ss(A, B, R) __extension__ ({ \
+#define _mm_div_round_ss(A, B, R) \
   (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm_mask_div_round_ss(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_div_round_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm_maskz_div_round_ss(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_div_round_ss(U, A, B, R) \
   (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_div_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, __W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)  _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_div_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
 }
 
-#define _mm_div_round_sd(A, B, R) __extension__ ({ \
+#define _mm_div_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm_mask_div_round_sd(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_div_round_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_div_round_sd(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_div_round_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_div_pd(__m512d __a, __m512d __b)
 {
   return (__m512d)((__v8df)__a/(__v8df)__b);
 }
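
The unmasked division (unchanged context here) is written with the C vector
division operator rather than a builtin, so the middle end can fold and
simplify it like scalar code. For instance:

#include <immintrin.h>

__m512d half(__m512d x) {
  /* Plain vector arithmetic: one vdivpd, which the optimizer may turn
     into a multiply by 0.5 under -ffast-math. */
  return _mm512_div_pd(x, _mm512_set1_pd(2.0));
}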
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df) __W,
-             (__mmask8) __U,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_div_pd(__A, __B),
+                                              (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
-             (__v8df) __B,
-             (__v8df)
-             _mm512_setzero_pd (),
-             (__mmask8) __U,
-             _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_div_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_div_ps(__m512 __a, __m512 __b)
 {
   return (__m512)((__v16sf)__a/(__v16sf)__b);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf) __W,
-            (__mmask16) __U,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_div_ps(__A, __B),
+                                             (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
-            (__v16sf) __B,
-            (__v16sf)
-            _mm512_setzero_ps (),
-            (__mmask16) __U,
-            _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_div_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
-#define _mm512_div_round_pd(A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)-1, (int)(R)); })
+#define _mm512_div_round_pd(A, B, R) \
+  (__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
+                                   (__v8df)(__m512d)(B), (int)(R))
 
-#define _mm512_mask_div_round_pd(W, U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                        (int)(R)); })
+#define _mm512_mask_div_round_pd(W, U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_div_round_pd((A), (B), (R)), \
+                                   (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_div_round_pd(U, A, B, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(B), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)(U), (int)(R)); })
+#define _mm512_maskz_div_round_pd(U, A, B, R) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                   (__v8df)_mm512_div_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd())
 
-#define _mm512_div_round_ps(A, B, R) __extension__ ({ \
-  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)-1, (int)(R)); })
+#define _mm512_div_round_ps(A, B, R) \
+  (__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
+                                  (__v16sf)(__m512)(B), (int)(R))
 
-#define _mm512_mask_div_round_ps(W, U, A, B, R)  __extension__ ({ \
-  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                       (int)(R)); });
+#define _mm512_mask_div_round_ps(W, U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
+                                  (__v16sf)(__m512)(W))
 
-#define _mm512_maskz_div_round_ps(U, A, B, R)  __extension__ ({ \
-  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(B), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)(U), (int)(R)); });
+#define _mm512_maskz_div_round_ps(U, A, B, R) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                  (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps())
 
-#define _mm512_roundscale_ps(A, B) __extension__ ({ \
+#define _mm512_roundscale_ps(A, B) \
   (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
-                                         (__v16sf)(__m512)(A), (__mmask16)-1, \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         (__v16sf)_mm512_undefined_ps(), \
+                                         (__mmask16)-1, \
+                                         _MM_FROUND_CUR_DIRECTION)
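
The unmasked roundscale now passes _mm512_undefined_ps() as the passthrough
operand instead of reusing (A): with an all-ones mask no element is taken from
the passthrough, and the undefined value avoids an artificial extra use of A.
Caller-visible behavior is unchanged; for example, the usual round-to-integer
idiom:

#include <immintrin.h>

__m512 rint_ps(__m512 x) {
  /* The immediate's low bits select the rounding mode; the high nibble
     (zero here) requests zero fraction bits, i.e. round to integer. */
  return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEAREST_INT |
                                 _MM_FROUND_NO_EXC);
}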
 
-#define _mm512_mask_roundscale_ps(A, B, C, imm) __extension__ ({\
+#define _mm512_mask_roundscale_ps(A, B, C, imm) \
   (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_roundscale_ps(A, B, imm) __extension__ ({\
+#define _mm512_maskz_roundscale_ps(A, B, imm) \
   (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(A), \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) __extension__ ({ \
+#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \
   (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
-                                         (int)(R)); })
+                                         (int)(R))
 
-#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) __extension__ ({ \
+#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \
   (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
-                                         (__mmask16)(A), (int)(R)); })
+                                         (__mmask16)(A), (int)(R))
 
-#define _mm512_roundscale_round_ps(A, imm, R) __extension__ ({ \
+#define _mm512_roundscale_round_ps(A, imm, R) \
   (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
                                          (__v16sf)_mm512_undefined_ps(), \
-                                         (__mmask16)-1, (int)(R)); })
+                                         (__mmask16)-1, (int)(R))
 
-#define _mm512_roundscale_pd(A, B) __extension__ ({ \
+#define _mm512_roundscale_pd(A, B) \
   (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
-                                          (__v8df)(__m512d)(A), (__mmask8)-1, \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          (__v8df)_mm512_undefined_pd(), \
+                                          (__mmask8)-1, \
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_roundscale_pd(A, B, C, imm) __extension__ ({\
+#define _mm512_mask_roundscale_pd(A, B, C, imm) \
   (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_roundscale_pd(A, B, imm) __extension__ ({\
+#define _mm512_maskz_roundscale_pd(A, B, imm) \
   (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(A), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) __extension__ ({ \
+#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \
   (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) __extension__ ({ \
+#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \
   (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
-                                          (__mmask8)(A), (int)(R)); })
+                                          (__mmask8)(A), (int)(R))
 
-#define _mm512_roundscale_round_pd(A, imm, R) __extension__ ({ \
+#define _mm512_roundscale_round_pd(A, imm, R) \
   (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
                                           (__v8df)_mm512_undefined_pd(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), \
-                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
-                                           (int)(R)); })
-
-
-#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+#define _mm512_fmadd_round_pd(A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
 
-#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R))
+
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
-#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
-#define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
+#define _mm512_fmsub_round_pd(A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
 
-#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
 
-#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
-#define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
+#define _mm512_fnmadd_round_pd(A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
-                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
-                                           (int)(R)); })
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R))
 
 
-#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
-#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
-#define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
+#define _mm512_fnmsub_round_pd(A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
 
-#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
@@ -2651,7 +2527,7 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
@@ -2661,7 +2537,7 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
@@ -2671,7 +2547,7 @@
                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
@@ -2681,7 +2557,7 @@
                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
@@ -2691,7 +2567,7 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
@@ -2701,7 +2577,7 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
@@ -2711,17 +2587,17 @@
                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
-                                                    (__v8df) __B,
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    -(__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) -1,
                                                     _MM_FROUND_CUR_DIRECTION);
 }
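
The fnmadd/fnmsub bodies now negate __B instead of __A. The two spellings are
sign-equivalent, since -(a*b) == a*(-b), and leaving __A untouched matters
because __A doubles as the passthrough operand in the mask variants. A quick
sanity check with illustrative values:

#include <immintrin.h>
#include <assert.h>

int main(void) {
  __m512d a = _mm512_set1_pd(3.0);
  __m512d b = _mm512_set1_pd(4.0);
  __m512d c = _mm512_set1_pd(1.0);
  /* fnmadd computes -(a*b) + c, i.e. -11.0 in every lane. */
  assert(_mm512_cvtsd_f64(_mm512_fnmadd_pd(a, b, c)) == -11.0);
  return 0;
}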
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
@@ -2731,7 +2607,7 @@
                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
@@ -2741,17 +2617,17 @@
                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
-                                                    (__v8df) __B,
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    -(__v8df) __B,
                                                     -(__v8df) __C,
                                                     (__mmask8) -1,
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
@@ -2761,91 +2637,91 @@
                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
-                                          (int)(R)); })
-
-
-#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+#define _mm512_fmadd_round_ps(A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)-1, (int)(R))
 
 
-#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R))
+
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
   (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
 
-#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
 
-#define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
+#define _mm512_fmsub_round_ps(A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
-                                          (__mmask16)-1, (int)(R)); })
+                                          (__mmask16)-1, (int)(R))
 
 
-#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
   (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
 
-#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
 
-#define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
-                                          (int)(R)); })
+#define _mm512_fnmadd_round_ps(A, B, C, R) \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          -(__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R))
 
 
-#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
   (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
 
-#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
 
-#define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
+#define _mm512_fnmsub_round_ps(A, B, C, R) \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          -(__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
-                                          (__mmask16)-1, (int)(R)); })
+                                          (__mmask16)-1, (int)(R))
 
 
-#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
@@ -2855,7 +2731,7 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
@@ -2865,7 +2741,7 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
   return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
@@ -2875,7 +2751,7 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
@@ -2885,7 +2761,7 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
@@ -2895,7 +2771,7 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
@@ -2905,7 +2781,7 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
@@ -2915,17 +2791,17 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
-                                                   (__v16sf) __B,
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   -(__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
   return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
@@ -2935,7 +2811,7 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
@@ -2945,17 +2821,17 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
-                                                   (__v16sf) __B,
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   -(__v16sf) __B,
                                                    -(__v16sf) __C,
                                                    (__mmask16) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
@@ -2965,96 +2841,96 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
+#define _mm512_fmaddsub_round_pd(A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
-                                              (__mmask8)-1, (int)(R)); })
+                                              (__mmask8)-1, (int)(R))
 
 
-#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
-                                              (__mmask8)(U), (int)(R)); })
+                                              (__mmask8)(U), (int)(R))
 
 
-#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
   (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)); })
+                                               (__mmask8)(U), (int)(R))
 
 
-#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)); })
+                                               (__mmask8)(U), (int)(R))
 
 
-#define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
+#define _mm512_fmsubadd_round_pd(A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
-                                              (__mmask8)-1, (int)(R)); })
+                                              (__mmask8)-1, (int)(R))
 
 
-#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
-                                              (__mmask8)(U), (int)(R)); })
+                                              (__mmask8)(U), (int)(R))
 
 
-#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
   (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                -(__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)); })
+                                               (__mmask8)(U), (int)(R))
 
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       (__v8df) __C,
-                                                       (__mmask8) -1,
-                                                       _MM_FROUND_CUR_DIRECTION);
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
 }
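
The churn around _mm512_fmaddsub_pd and its masked forms below is a one-column
re-alignment of continuation lines; the diff matches identical lines across
adjacent functions, which makes it read like a reorder. Behavior is unchanged:
fmaddsub subtracts __C in even-indexed lanes and adds it in odd-indexed lanes,
as this sketch checks:

#include <immintrin.h>
#include <assert.h>

int main(void) {
  __m512d a = _mm512_set1_pd(2.0);
  __m512d b = _mm512_set1_pd(3.0);
  __m512d c = _mm512_set1_pd(1.0);
  __m512d r = _mm512_fmaddsub_pd(a, b, c);
  /* Lane 0 is even-indexed, so it holds a*b - c = 5.0. */
  assert(_mm512_cvtsd_f64(r) == 5.0);
  return 0;
}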
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
                                                        (__v8df) __B,
                                                        (__v8df) __C,
                                                        (__mmask8) __U,
                                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
-                                                        (__v8df) __B,
-                                                        (__v8df) __C,
-                                                        (__mmask8) __U,
-                                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
-                                                        (__v8df) __B,
-                                                        (__v8df) __C,
-                                                        (__mmask8) __U,
-                                                        _MM_FROUND_CUR_DIRECTION);
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
@@ -3064,7 +2940,7 @@
                                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
@@ -3074,7 +2950,7 @@
                                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
   return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
@@ -3084,56 +2960,56 @@
                                                         _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
+#define _mm512_fmaddsub_round_ps(A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
-                                             (__mmask16)-1, (int)(R)); })
+                                             (__mmask16)-1, (int)(R))
 
 
-#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
   (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
-                                             (__mmask16)(U), (int)(R)); })
+                                             (__mmask16)(U), (int)(R))
 
 
-#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
   (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)); })
+                                              (__mmask16)(U), (int)(R))
 
 
-#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)); })
+                                              (__mmask16)(U), (int)(R))
 
 
-#define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
+#define _mm512_fmsubadd_round_ps(A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
-                                             (__mmask16)-1, (int)(R)); })
+                                             (__mmask16)-1, (int)(R))
 
 
-#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
   (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
-                                             (__mmask16)(U), (int)(R)); })
+                                             (__mmask16)(U), (int)(R))
 
 
-#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
   (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               -(__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)); })
+                                              (__mmask16)(U), (int)(R))
 
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
@@ -3143,7 +3019,7 @@
                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
@@ -3153,7 +3029,7 @@
                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
   return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
@@ -3163,7 +3039,7 @@
                                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
@@ -3173,7 +3049,7 @@
                                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
@@ -3183,7 +3059,7 @@
                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
@@ -3193,7 +3069,7 @@
                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
   return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
@@ -3203,314 +3079,309 @@
                                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
   (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
   (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
   (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)); })
+                                               (__mmask8)(U), (int)(R))
 
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
-                                                        (__v8df) __B,
-                                                        (__v8df) __C,
-                                                        (__mmask8) __U,
-                                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              (__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)); })
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
-  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
-                                                       (__v16sf) __B,
-                                                       (__v16sf) __C,
-                                                       (__mmask16) __U,
+  return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
                                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
+  (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R))
 
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
-
-
-#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             (__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)); })
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
-  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
-                                                      (__v8df) __B,
-                                                      (__v8df) __C,
-                                                      (__mmask8) __U,
+  return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)(__m512)(A), \
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           -(__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R))
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    -(__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          -(__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R))
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   -(__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           -(__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R))
+
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
+  (__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R))
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    -(__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          -(__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R))
+
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
+  (__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
 
-#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            (__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)); })
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   -(__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
                                                     (__v16sf) __B,
                                                     (__v16sf) __C,
                                                     (__mmask16) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
-  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
-                                                     (__v16sf) __B,
-                                                     (__v16sf) __C,
-                                                     (__mmask16) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
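
The fnmadd/fnmsub rewrites above all rest on one algebraic identity: instead of dedicated `vfnmadd*`/`vfnmsub*` builtins, the negated-multiply forms are expressed by negating operands of the ordinary fused-multiply builtins, and the backend folds the negations back into FNMADD/FNMSUB instructions. The scalar identities, checked in plain C (link with -lm):

    /* fnmadd(a,b,c) = -(a*b) + c = fma(a, -b, c)
       fnmsub(a,b,c) = -(a*b) - c = fma(a, -b, -c)  */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      double a = 1.5, b = 2.0, c = 0.25;
      printf("%g %g\n", -(a * b) + c, fma(a, -b, c));   /* -2.75 -2.75 */
      printf("%g %g\n", -(a * b) - c, fma(a, -b, -c));  /* -3.25 -3.25 */
      return 0;
    }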
 
 
 /* Vector permutations */
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
-                                                       /* idx */ ,
-                                                       (__v16si) __A,
-                                                       (__v16si) __B,
-                                                       (__mmask16) -1);
+  return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
+                                                (__v16si) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U,
-                                __m512i __I, __m512i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
+                               __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
-                                                        /* idx */ ,
-                                                        (__v16si) __A,
-                                                        (__v16si) __B,
-                                                        (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
+                              (__v16si)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
-                                 __m512i __I, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I
-                                                        /* idx */ ,
-                                                        (__v16si) __A,
-                                                        (__v16si) __B,
-                                                        (__mmask16) __U);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
-                                                       /* idx */ ,
-                                                       (__v8di) __A,
-                                                       (__v8di) __B,
-                                                       (__mmask8) -1);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
                                 __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
-                                                       /* idx */ ,
-                                                       (__v8di) __A,
-                                                       (__v8di) __B,
-                                                       (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
+                              (__v16si)__I);
 }
 
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
-         __m512i __I, __m512i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
+                                __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I
-                                                        /* idx */ ,
-                                                        (__v8di) __A,
-                                                        (__v8di) __B,
-                                                        (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
+                              (__v16si)_mm512_setzero_si512());
 }
 
-#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
-                                         (__v8di)(__m512i)(B), (int)(I), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)-1); })
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
+                                                (__v8di) __B);
+}
 
-#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\
-  (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
-                                         (__v8di)(__m512i)(B), (int)(imm), \
-                                         (__v8di)(__m512i)(W), \
-                                         (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
+                               __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__U,
+                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
+                               (__v8di)__A);
+}
 
-#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\
-  (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
-                                         (__v8di)(__m512i)(B), (int)(imm), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
+                                __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__U,
+                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
+                               (__v8di)__I);
+}
 
-#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
-                                         (__v16si)(__m512i)(B), (int)(I), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
+                                __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__U,
+                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
+                               (__v8di)_mm512_setzero_si512());
+}
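
The permutex2var rewrites above replace the masked `vpermt2var*` builtins with a single unmasked permute builtin composed with a generic per-lane select (`__builtin_ia32_selectd_512` / `__builtin_ia32_selectq_512`): the mask picks, lane by lane, between the permute result and a pass-through vector (`__A` for the `_mask` form, the index vector `__I` for the new `_mask2` form, zero for `_maskz`). A scalar model of the select semantics, as a hypothetical loop over a 16-lane array:

    #include <stdint.h>

    /* Hypothetical scalar model of the selectd_512 pattern used above. */
    static void select16_i32(uint16_t mask, const int32_t op[16],
                             const int32_t passthru[16], int32_t out[16]) {
      for (int lane = 0; lane < 16; ++lane)
        out[lane] = ((mask >> lane) & 1) ? op[lane] : passthru[lane];
    }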
 
-#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\
-  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
-                                         (__v16si)(__m512i)(B), (int)(imm), \
-                                         (__v16si)(__m512i)(W), \
-                                         (__mmask16)(U)); })
+#define _mm512_alignr_epi64(A, B, I) \
+  (__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
+                                    (__v8di)(__m512i)(B), (int)(I))
 
-#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\
-  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
-                                         (__v16si)(__m512i)(B), (int)(imm), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)(U)); })
+#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+                                 (__v8di)(__m512i)(W))
+
+#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+                                 (__v8di)_mm512_setzero_si512())
+
+#define _mm512_alignr_epi32(A, B, I) \
+  (__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
+                                    (__v16si)(__m512i)(B), (int)(I))
+
+#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+                                (__v16si)(__m512i)(W))
+
+#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+                                (__v16si)_mm512_setzero_si512())
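
Same composition for the alignr family: the masked and zero-masked forms now wrap the unmasked `_mm512_alignr_epi64`/`_mm512_alignr_epi32` macro in a select builtin instead of calling a masked `align*512_mask` builtin. These stay macros because the shift count must be an integer constant expression. A usage sketch, assuming an AVX-512 target:

    #include <immintrin.h>

    /* Lanes set in the mask take the concatenated-shift result; the rest
       keep the corresponding lane of w.  The immediate 3 must be constant. */
    __m512i alignr_low_lanes(__m512i a, __m512i b, __m512i w) {
      return _mm512_mask_alignr_epi32(w, (__mmask16)0x00FF, a, b, 3);
    }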
 /* Vector Extract */
 
-#define _mm512_extractf64x4_pd(A, I) __extension__ ({                    \
+#define _mm512_extractf64x4_pd(A, I) \
   (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
-                                            (__v4df)_mm256_setzero_si256(), \
-                                            (__mmask8)-1); })
+                                            (__v4df)_mm256_undefined_pd(), \
+                                            (__mmask8)-1)
 
-#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\
+#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
   (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)(__m256d)(W), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\
+#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
   (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)_mm256_setzero_pd(), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm512_extractf32x4_ps(A, I) __extension__ ({                    \
+#define _mm512_extractf32x4_ps(A, I) \
   (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1); })
+                                           (__v4sf)_mm_undefined_ps(), \
+                                           (__mmask8)-1)
 
-#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\
+#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
   (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)(__m128)(W), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\
+#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
   (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
+
 /* Vector Blend */
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
 {
   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
@@ -3518,7 +3389,7 @@
                  (__v8df) __A);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
 {
   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
@@ -3526,7 +3397,7 @@
                 (__v16sf) __A);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
 {
   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
@@ -3534,7 +3405,7 @@
                 (__v8di) __A);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
 {
   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
@@ -3544,57 +3415,135 @@
 
 /* Compare */
 
-#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
+#define _mm512_cmp_round_ps_mask(A, B, P, R) \
   (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
-                                          (__mmask16)-1, (int)(R)); })
+                                          (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
+#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
   (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
 #define _mm512_cmp_ps_mask(A, B, P) \
   _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
 #define _mm512_mask_cmp_ps_mask(U, A, B, P) \
   _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
-  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
-                                         (__v8df)(__m512d)(B), (int)(P), \
-                                         (__mmask8)-1, (int)(R)); })
+#define _mm512_cmpeq_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
 
-#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
+#define _mm512_cmplt_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
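
The block above adds named single-precision comparison wrappers; each simply fixes the predicate of `_mm512_cmp_ps_mask`. The ordered/unordered suffixes encode NaN behavior: `_CMP_EQ_OQ`, `_CMP_LT_OS`, and `_CMP_LE_OS` are false on NaN lanes, while `_CMP_NEQ_UQ`, `_CMP_NLT_US`, `_CMP_NLE_US`, and `_CMP_UNORD_Q` are true. A usage sketch, assuming an AVX-512 target:

    #include <immintrin.h>

    /* NaN lanes report 1 here: _CMP_NEQ_UQ is unordered-or-not-equal. */
    __mmask16 nan_aware_neq(__m512 a, __m512 b) {
      return _mm512_cmpneq_ps_mask(a, b);  /* == _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ) */
    }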
+
+#define _mm512_cmp_round_pd_mask(A, B, P, R) \
   (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(B), (int)(P), \
-                                         (__mmask8)(U), (int)(R)); })
+                                         (__mmask8)-1, (int)(R))
+
+#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), (int)(P), \
+                                         (__mmask8)(U), (int)(R))
 
 #define _mm512_cmp_pd_mask(A, B, P) \
   _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
 #define _mm512_mask_cmp_pd_mask(U, A, B, P) \
   _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
 
+#define _mm512_cmpeq_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
+
 /* Conversion */
 
-#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \
+#define _mm512_cvtt_roundps_epu32(A, R) \
   (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)_mm512_undefined_epi32(), \
-                                             (__mmask16)-1, (int)(R)); })
+                                             (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
   (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)(__m512i)(W), \
-                                             (__mmask16)(U), (int)(R)); })
+                                             (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
   (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)_mm512_setzero_si512(), \
-                                             (__mmask16)(U), (int)(R)); })
+                                             (__mmask16)(U), (int)(R))
 
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_cvttps_epu32(__m512 __A)
 {
   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
@@ -3604,7 +3553,7 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
 {
   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
@@ -3613,7 +3562,7 @@
                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
 {
   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
@@ -3622,156 +3571,164 @@
                    _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
+#define _mm512_cvt_roundepi32_ps(A, R) \
   (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)-1, (int)(R)); })
+                                          (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
   (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)(__m512)(W), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
   (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
-#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
+#define _mm512_cvt_roundepu32_ps(A, R) \
   (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)-1, (int)(R)); })
+                                           (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
   (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)(__m512)(W), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
   (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_cvtepu32_ps (__m512i __A)
 {
-  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
-                 (__v16sf) _mm512_undefined_ps (),
-                 (__mmask16) -1,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
 {
-  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
-                 (__v16sf) __W,
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_cvtepu32_ps(__A),
+                                             (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
 {
-  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
-                 (__v16sf) _mm512_setzero_ps (),
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_cvtepu32_ps(__A),
+                                             (__v16sf)_mm512_setzero_ps());
 }
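
The conversion rewrites above and below swap target-specific `cvt*` builtins for clang's generic `__builtin_convertvector`, which converts element-wise between two vector types of equal lane count and lets the middle end pick the instruction; the masked variants are again layered on top with a select. A minimal standalone sketch of the builtin:

    /* clang vector extension: unsigned-int -> float, converted per lane,
       the same operation vcvtudq2ps performs on each element. */
    typedef unsigned int u32x4 __attribute__((vector_size(16)));
    typedef float        f32x4 __attribute__((vector_size(16)));

    static f32x4 u32x4_to_f32x4(u32x4 v) {
      return __builtin_convertvector(v, f32x4);
    }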
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_cvtepi32_pd(__m256i __A)
 {
-  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
-                (__v8df)
-                _mm512_setzero_pd (),
-                (__mmask8) -1);
+  return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
 {
-  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
-                (__v8df) __W,
-                (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepi32_pd(__A),
+                                              (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
 {
-  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
-                (__v8df) _mm512_setzero_pd (),
-                (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepi32_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtepi32lo_pd(__m512i __A)
+{
+  return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+  return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_cvtepi32_ps (__m512i __A)
 {
-  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
-                (__v16sf) _mm512_undefined_ps (),
-                (__mmask16) -1,
-                _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
 {
-  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
-                (__v16sf) __W,
-                (__mmask16) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_cvtepi32_ps(__A),
+                                             (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
 {
-  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
-                (__v16sf) _mm512_setzero_ps (),
-                (__mmask16) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_cvtepi32_ps(__A),
+                                             (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_cvtepu32_pd(__m256i __A)
 {
-  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
-                (__v8df)
-                _mm512_setzero_pd (),
-                (__mmask8) -1);
+  return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
 {
-  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
-                  (__v8df) __W,
-                  (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepu32_pd(__A),
+                                              (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
 {
-  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
-                  (__v8df) _mm512_setzero_pd (),
-                  (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepu32_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtepu32lo_pd(__m512i __A)
+{
+  return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+  return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
+}
+
+#define _mm512_cvt_roundpd_ps(A, R) \
   (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
   (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-#define _mm512_maskz_cvt_roundpd_ps(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
   (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
 _mm512_cvtpd_ps (__m512d __A)
 {
   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
@@ -3780,7 +3737,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
 {
   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
@@ -3789,7 +3746,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
 {
   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
@@ -3798,53 +3755,71 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_cvtpd_pslo (__m512d __A)
+{
+  return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
+                (__v8sf) _mm256_setzero_ps (),
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
+{
+  return (__m512) __builtin_shufflevector (
+                (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
+                                               __U, __A),
+                (__v8sf) _mm256_setzero_ps (),
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
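
The new `_mm512_cvtpd_pslo` helpers above convert the eight doubles to eight floats, then widen the 256-bit result to 512 bits with `__builtin_shufflevector`: indices 8-15 select from the `_mm256_setzero_ps()` operand, so the upper eight lanes come out zero. The same widening pattern in a small standalone sketch:

    typedef float f32x4 __attribute__((vector_size(16)));
    typedef float f32x8 __attribute__((vector_size(32)));

    /* Indices 0-3 pick lanes of lo; 4-7 pick lanes of the zero vector. */
    static f32x8 widen_zero_upper(f32x4 lo) {
      f32x4 zero = {0, 0, 0, 0};
      return __builtin_shufflevector(lo, zero, 0, 1, 2, 3, 4, 5, 6, 7);
    }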
+
+#define _mm512_cvt_roundps_ph(A, I) \
   (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_undefined_si256(), \
-                                            (__mmask16)-1); })
+                                            (__mmask16)-1)
 
-#define _mm512_mask_cvt_roundps_ph(U, W, A, I) __extension__ ({ \
+#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
   (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)(__m256i)(U), \
-                                            (__mmask16)(W)); })
+                                            (__mmask16)(W))
 
-#define _mm512_maskz_cvt_roundps_ph(W, A, I) __extension__ ({ \
+#define _mm512_maskz_cvt_roundps_ph(W, A, I) \
   (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_setzero_si256(), \
-                                            (__mmask16)(W)); })
+                                            (__mmask16)(W))
 
-#define _mm512_cvtps_ph(A, I) __extension__ ({ \
+#define _mm512_cvtps_ph(A, I) \
   (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_setzero_si256(), \
-                                            (__mmask16)-1); })
+                                            (__mmask16)-1)
 
-#define _mm512_mask_cvtps_ph(U, W, A, I) __extension__ ({ \
+#define _mm512_mask_cvtps_ph(U, W, A, I) \
   (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)(__m256i)(U), \
-                                            (__mmask16)(W)); })
+                                            (__mmask16)(W))
 
-#define _mm512_maskz_cvtps_ph(W, A, I) __extension__ ({\
+#define _mm512_maskz_cvtps_ph(W, A, I) \
   (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_setzero_si256(), \
-                                            (__mmask16)(W)); })
+                                            (__mmask16)(W))
 
-#define _mm512_cvt_roundph_ps(A, R) __extension__ ({ \
+#define _mm512_cvt_roundph_ps(A, R) \
   (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)_mm512_undefined_ps(), \
-                                           (__mmask16)-1, (int)(R)); })
+                                           (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundph_ps(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
   (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)(__m512)(W), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundph_ps(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundph_ps(U, A, R) \
   (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
 
-static  __inline __m512 __DEFAULT_FN_ATTRS
+static  __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_cvtph_ps(__m256i __A)
 {
   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
@@ -3854,7 +3829,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
 {
   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
@@ -3863,7 +3838,7 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
 {
   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
@@ -3872,22 +3847,22 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
+#define _mm512_cvtt_roundpd_epi32(A, R) \
   (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
   (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)(__m256i)(W), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
   (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvttpd_epi32(__m512d __a)
 {
   return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
@@ -3896,7 +3871,7 @@
                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
 {
   return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
@@ -3905,7 +3880,7 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
 {
   return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
@@ -3914,22 +3889,22 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
+#define _mm512_cvtt_roundps_epi32(A, R) \
   (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
-                                            (__mmask16)-1, (int)(R)); })
+                                            (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
   (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)(__m512i)(W), \
-                                            (__mmask16)(U), (int)(R)); })
+                                            (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
   (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
-                                            (__mmask16)(U), (int)(R)); })
+                                            (__mmask16)(U), (int)(R))
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_cvttps_epi32(__m512 __a)
 {
   return (__m512i)
@@ -3938,7 +3913,7 @@
                                      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
 {
   return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
@@ -3947,7 +3922,7 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
 {
   return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
@@ -3956,22 +3931,22 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
+#define _mm512_cvt_roundps_epi32(A, R) \
   (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
-                                           (__mmask16)-1, (int)(R)); })
+                                           (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
   (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)(__m512i)(W), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundps_epi32(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
   (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_cvtps_epi32 (__m512 __A)
 {
   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
@@ -3980,7 +3955,7 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
 {
   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
@@ -3989,7 +3964,7 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
 {
   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
@@ -3999,22 +3974,22 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
+#define _mm512_cvt_roundpd_epi32(A, R) \
   (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
   (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)(__m256i)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
   (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvtpd_epi32 (__m512d __A)
 {
   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
@@ -4024,7 +3999,7 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
 {
   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
@@ -4033,7 +4008,7 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
 {
   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
@@ -4043,32 +4018,32 @@
                  _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
+#define _mm512_cvt_roundps_epu32(A, R) \
   (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
-                                            (__mmask16)-1, (int)(R)); })
+                                            (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
   (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)(__m512i)(W), \
-                                            (__mmask16)(U), (int)(R)); })
+                                            (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundps_epu32(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
   (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
-                                            (__mmask16)(U), (int)(R)); })
+                                            (__mmask16)(U), (int)(R))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_cvtps_epu32 ( __m512 __A)
 {
-  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\
-                  (__v16si)\
-                  _mm512_undefined_epi32 (),\
-                  (__mmask16) -1,\
-                  _MM_FROUND_CUR_DIRECTION);\
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                  (__v16si)
+                  _mm512_undefined_epi32 (),
+                  (__mmask16) -1,
+                  _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
 {
   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
@@ -4077,32 +4052,32 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
 {
   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
-                  (__v16si) 
+                  (__v16si)
                   _mm512_setzero_si512 (),
                   (__mmask16) __U ,
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
+#define _mm512_cvt_roundpd_epu32(A, R) \
   (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
   (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
-                                            (__v8si)(W), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__v8si)(__m256i)(W), \
+                                            (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
   (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvtpd_epu32 (__m512d __A)
 {
   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
@@ -4112,7 +4087,7 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
 {
   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
@@ -4121,7 +4096,7 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
 {
   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
@@ -4131,16 +4106,28 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_cvtsd_f64(__m512d __a)
+{
+  return __a[0];
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_cvtss_f32(__m512 __a)
+{
+  return __a[0];
+}
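+/* The two extractors above use Clang's vector subscripting: indexing
+ * a vector value with [0] reads the lowest element directly, with no
+ * shuffle or memory round-trip.  Hypothetical usage:
+ *
+ *   __m512d __v = _mm512_set1_pd(3.0);
+ *   double __lo = _mm512_cvtsd_f64(__v);   // 3.0
+ */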
+
 /* Unpack and Interleave */
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_unpackhi_pd(__m512d __a, __m512d __b)
 {
   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
@@ -4148,7 +4135,7 @@
                                            (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
@@ -4156,14 +4143,14 @@
                                            (__v8df)_mm512_setzero_pd());
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_unpacklo_pd(__m512d __a, __m512d __b)
 {
   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
@@ -4171,7 +4158,7 @@
                                            (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
@@ -4179,7 +4166,7 @@
                                            (__v8df)_mm512_setzero_pd());
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_unpackhi_ps(__m512 __a, __m512 __b)
 {
   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
@@ -4189,7 +4176,7 @@
                                          2+12, 18+12, 3+12, 19+12);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
@@ -4197,7 +4184,7 @@
                                           (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
@@ -4205,7 +4192,7 @@
                                           (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_unpacklo_ps(__m512 __a, __m512 __b)
 {
   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
@@ -4215,7 +4202,7 @@
                                          0+12, 16+12, 1+12, 17+12);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
@@ -4223,7 +4210,7 @@
                                           (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
@@ -4231,7 +4218,7 @@
                                           (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_unpackhi_epi32(__m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
@@ -4241,7 +4228,7 @@
                                           2+12, 18+12, 3+12, 19+12);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
@@ -4249,7 +4236,7 @@
                                        (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
@@ -4257,7 +4244,7 @@
                                        (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_unpacklo_epi32(__m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
@@ -4267,7 +4254,7 @@
                                           0+12, 16+12, 1+12, 17+12);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
@@ -4275,7 +4262,7 @@
                                        (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
@@ -4283,14 +4270,14 @@
                                        (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_unpackhi_epi64(__m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
@@ -4298,7 +4285,7 @@
                                         (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
@@ -4306,14 +4293,14 @@
                                         (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
@@ -4321,7 +4308,7 @@
                                         (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
@@ -4329,50 +4316,19 @@
                                         (__v8di)_mm512_setzero_si512());
 }
 
-/* Bit Test */
-
-static __inline __mmask16 __DEFAULT_FN_ATTRS
-_mm512_test_epi32_mask(__m512i __A, __m512i __B)
-{
-  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
-            (__v16si) __B,
-            (__mmask16) -1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
-                 (__v16si) __B, __U);
-}
-
-static __inline __mmask8 __DEFAULT_FN_ATTRS
-_mm512_test_epi64_mask(__m512i __A, __m512i __B)
-{
-  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
-                 (__v8di) __B,
-                 (__mmask8) -1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U);
-}
-
 
 /* SIMD load ops */
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_loadu_si512 (void const *__P)
 {
-  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
-                  (__v16si)
-                  _mm512_setzero_si512 (),
-                  (__mmask16) -1);
+  struct __loadu_si512 {
+    __m512i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si512*)__P)->__v;
 }
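 /* The unaligned load above no longer calls a masked builtin.  Reading
  * through a one-member struct marked __packed__ (alignment 1, so any
  * address is valid) and __may_alias__ (so the access is exempt from
  * strict aliasing) expresses an unaligned 64-byte load that the
  * compiler can lower to a plain unaligned vector move.  Generic
  * sketch of the idiom (names hypothetical):
  *
  *   struct __loadu_any { __m512i __v; }
  *       __attribute__((__packed__, __may_alias__));
  *   __m512i __x = ((const struct __loadu_any *)__P)->__v;
  */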
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
@@ -4381,7 +4337,7 @@
 }
 
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
@@ -4390,7 +4346,7 @@
                                                      (__mmask16) __U);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
@@ -4398,7 +4354,7 @@
                   (__mmask8) __U);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
@@ -4407,7 +4363,7 @@
                                                      (__mmask8) __U);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
 {
   return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
@@ -4415,7 +4371,7 @@
                    (__mmask16) __U);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
 {
   return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
@@ -4424,7 +4380,7 @@
                                                   (__mmask16) __U);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
 {
   return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
@@ -4432,7 +4388,7 @@
                 (__mmask8) __U);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
 {
   return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
@@ -4441,8 +4397,8 @@
                                                    (__mmask8) __U);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
-_mm512_loadu_pd(double const *__p)
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_loadu_pd(void const *__p)
 {
   struct __loadu_pd {
     __m512d __v;
@@ -4450,8 +4406,8 @@
   return ((struct __loadu_pd*)__p)->__v;
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
-_mm512_loadu_ps(float const *__p)
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_loadu_ps(void const *__p)
 {
   struct __loadu_ps {
     __m512 __v;
@@ -4459,16 +4415,13 @@
   return ((struct __loadu_ps*)__p)->__v;
 }
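 /* Note the loosened prototypes: _mm512_loadu_pd and _mm512_loadu_ps
  * now take void const * instead of typed pointers, matching the Intel
  * Intrinsics Guide and dropping the need for casts at call sites:
  *
  *   unsigned char __buf[64];
  *   __m512 __v = _mm512_loadu_ps(__buf);   // previously required
  *                                          // (float const *)__buf
  */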
 
-static __inline __m512 __DEFAULT_FN_ATTRS
-_mm512_load_ps(float const *__p)
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_load_ps(void const *__p)
 {
-  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
-                                                  (__v16sf)
-                                                  _mm512_setzero_ps (),
-                                                  (__mmask16) -1);
+  return *(__m512*)__p;
 }
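 /* Aligned loads become plain dereferences: *(__m512 *)__p inherits
  * the 64-byte alignment of __m512, so the alignment contract is
  * carried by the type itself, and the masked builtin is kept only
  * for the mask/maskz variants that need it.  E.g. (sketch):
  *
  *   _Alignas(64) float __buf[16];
  *   __m512 __v = _mm512_load_ps(__buf);   // misaligned __buf is UB
  */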
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
 {
   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
@@ -4476,7 +4429,7 @@
                    (__mmask16) __U);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
 {
   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
@@ -4485,16 +4438,13 @@
                                                   (__mmask16) __U);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
-_mm512_load_pd(double const *__p)
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_load_pd(void const *__p)
 {
-  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
-                                                   (__v8df)
-                                                   _mm512_setzero_pd (),
-                                                   (__mmask8) -1);
+  return *(__m512d*)__p;
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
 {
   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
@@ -4502,7 +4452,7 @@
                           (__mmask8) __U);
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_load_pd(__mmask8 __U, void const *__P)
 {
   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
@@ -4511,19 +4461,19 @@
                                                    (__mmask8) __U);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_load_si512 (void const *__P)
 {
   return *(__m512i *) __P;
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_load_epi32 (void const *__P)
 {
   return *(__m512i *) __P;
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_load_epi64 (void const *__P)
 {
   return *(__m512i *) __P;
@@ -4531,90 +4481,98 @@
 
 /* SIMD store ops */
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
 {
   __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
                                      (__mmask8) __U);
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_storeu_si512 (void *__P, __m512i __A)
 {
-  __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
-            (__mmask16) -1);
+  struct __storeu_si512 {
+    __m512i __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si512*)__P)->__v = __A;
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
 {
   __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
                                      (__mmask16) __U);
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
 {
   __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_storeu_pd(void *__P, __m512d __A)
 {
-  __builtin_ia32_storeupd512_mask((double *)__P, (__v8df)__A, (__mmask8)-1);
+  struct __storeu_pd {
+    __m512d __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_pd*)__P)->__v = __A;
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
 {
   __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
                                    (__mmask16) __U);
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_storeu_ps(void *__P, __m512 __A)
 {
-  __builtin_ia32_storeups512_mask((float *)__P, (__v16sf)__A, (__mmask16)-1);
+  struct __storeu_ps {
+    __m512 __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_ps*)__P)->__v = __A;
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
 {
   __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_store_pd(void *__P, __m512d __A)
 {
   *(__m512d*)__P = __A;
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
 {
   __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
                                    (__mmask16) __U);
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_store_ps(void *__P, __m512 __A)
 {
   *(__m512*)__P = __A;
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_store_si512 (void *__P, __m512i __A)
 {
   *(__m512i *) __P = __A;
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_store_epi32 (void *__P, __m512i __A)
 {
   *(__m512i *) __P = __A;
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+static __inline void __DEFAULT_FN_ATTRS512
 _mm512_store_epi64 (void *__P, __m512i __A)
 {
   *(__m512i *) __P = __A;
@@ -4622,7 +4580,7 @@
 
 /* Mask ops */
 
-static __inline __mmask16 __DEFAULT_FN_ATTRS
+static __inline __mmask16 __DEFAULT_FN_ATTRS512
 _mm512_knot(__mmask16 __M)
 {
   return __builtin_ia32_knothi(__M);
@@ -4630,832 +4588,599 @@
 
 /* Integer compare */
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
-                                                   (__mmask16)-1);
-}
+#define _mm512_cmpeq_epi32_mask(A, B) \
+    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
+    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epi32_mask(A, B) \
+    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
+    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epi32_mask(A, B) \
+    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
+    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epi32_mask(A, B) \
+    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epi32_mask(k, A, B) \
+    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epi32_mask(A, B) \
+    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
+    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epi32_mask(A, B) \
+    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
+    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
-                                                   __u);
-}
+#define _mm512_cmpeq_epu32_mask(A, B) \
+    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
+    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epu32_mask(A, B) \
+    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
+    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epu32_mask(A, B) \
+    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
+    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epu32_mask(A, B) \
+    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epu32_mask(k, A, B) \
+    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epu32_mask(A, B) \
+    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
+    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epu32_mask(A, B) \
+    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
+    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
-                                                 (__mmask16)-1);
-}
+#define _mm512_cmpeq_epi64_mask(A, B) \
+    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
+    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epi64_mask(A, B) \
+    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
+    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epi64_mask(A, B) \
+    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
+    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epi64_mask(A, B) \
+    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epi64_mask(k, A, B) \
+    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epi64_mask(A, B) \
+    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
+    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epi64_mask(A, B) \
+    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
+    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
-                                                 __u);
-}
+#define _mm512_cmpeq_epu64_mask(A, B) \
+    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
+    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epu64_mask(A, B) \
+    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
+    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epu64_mask(A, B) \
+    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
+    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epu64_mask(A, B) \
+    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epu64_mask(k, A, B) \
+    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epu64_mask(A, B) \
+    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
+    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epu64_mask(A, B) \
+    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
+    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
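+/* All fixed-predicate integer comparisons above are now thin macros
+ * over the general _mm512[_mask]_cmp_*_mask intrinsics, with the
+ * predicate named by an _MM_CMPINT_* constant instead of the raw
+ * immediates hard-coded in the removed builtin calls (EQ=0, LT=1,
+ * LE=2, NE=4, GE=5, GT=6).  Call sites are unchanged:
+ *
+ *   __mmask16 __m = _mm512_cmpeq_epi32_mask(__a, __b);
+ *   // expands to _mm512_cmp_epi32_mask(__a, __b, _MM_CMPINT_EQ)
+ */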
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
-                                                  __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
-                                                  (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
-                                                 __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
-                                                   (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
-                                                   __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
-                                                 __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
-                                                  __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
-                                                  (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
-                                                 __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
-                                                 __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
-                                                 __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
-                                                __u);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi32 (__m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi8_epi32(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
-                (__v16si)
-                _mm512_setzero_si512 (),
-                (__mmask16) -1);
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
-                (__v16si) __W,
-                (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepi8_epi32(__A),
+                                             (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
-                (__v16si)
-                _mm512_setzero_si512 (),
-                (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepi8_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
 }
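 /* The sign-extension intrinsics are now built from generic Clang
  * builtins instead of one masked x86 builtin: __builtin_convertvector
  * performs the widening (after a __builtin_shufflevector to take the
  * low half when the source has more lanes than the result), and
  * __builtin_ia32_selectd_512/selectq_512 applies the mask by blending
  * with __W, or with zero in the maskz form.  Sketch of the split:
  *
  *   __m512i __wide = (__m512i)__builtin_convertvector((__v16qs)__A,
  *                                                     __v16si);
  *   __m512i __r = (__m512i)__builtin_ia32_selectd_512(
  *                     (__mmask16)__U, (__v16si)__wide, (__v16si)__W);
  */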
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi64 (__m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi8_epi64(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) -1);
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
-                (__v8di) __W,
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi8_epi64(__A),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi8_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512 ());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi32_epi64 (__m256i __X)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi32_epi64(__m256i __X)
 {
-  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) -1);
+  return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
 {
-  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
-                (__v8di) __W,
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi32_epi64(__X),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
 {
-  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi32_epi64(__X),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi16_epi32 (__m256i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi16_epi32(__m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
-                (__v16si)
-                _mm512_setzero_si512 (),
-                (__mmask16) -1);
+  return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
-                (__v16si) __W,
-                (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepi16_epi32(__A),
+                                            (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
-                (__v16si)
-                _mm512_setzero_si512 (),
-                (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepi16_epi32(__A),
+                                            (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi16_epi64 (__m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi16_epi64(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) -1);
+  return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
-                (__v8di) __W,
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi16_epi64(__A),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi16_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi32 (__m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu8_epi32(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
-                (__v16si)
-                _mm512_setzero_si512 (),
-                (__mmask16) -1);
+  return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
-                (__v16si) __W,
-                (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepu8_epi32(__A),
+                                             (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
-                (__v16si)
-                _mm512_setzero_si512 (),
-                (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepu8_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi64 (__m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu8_epi64(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) -1);
+  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
-                (__v8di) __W,
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu8_epi64(__A),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu8_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
 }
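
The maskz forms now compose the plain conversion with a zeroing select. A hypothetical usage sketch (widen_even_bytes is an invented name, not from the header):

  #include <immintrin.h>

  /* Zero-extend the low eight bytes to 64-bit lanes, keeping only the
     even lanes; the maskz form zeroes the rest. */
  static __m512i widen_even_bytes(__m128i bytes) {
    return _mm512_maskz_cvtepu8_epi64(0x55, bytes);  /* mask 0b01010101 */
  }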
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu32_epi64 (__m256i __X)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu32_epi64(__m256i __X)
 {
-  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) -1);
+  return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
 {
-  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
-                (__v8di) __W,
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu32_epi64(__X),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
 {
-  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu32_epi64(__X),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu16_epi32 (__m256i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu16_epi32(__m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
-                (__v16si)
-                _mm512_setzero_si512 (),
-                (__mmask16) -1);
+  return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
-                (__v16si) __W,
-                (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepu16_epi32(__A),
+                                            (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
-                (__v16si)
-                _mm512_setzero_si512 (),
-                (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepu16_epi32(__A),
+                                            (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu16_epi64 (__m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu16_epi64(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) -1);
+  return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
-                (__v8di) __W,
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu16_epi64(__A),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
-                (__v8di)
-                _mm512_setzero_si512 (),
-                (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu16_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_rorv_epi32 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si) __W,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                           (__v16si)_mm512_rorv_epi32(__A, __B),
+                                           (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                           (__v16si)_mm512_rorv_epi32(__A, __B),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_rorv_epi64 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) -1);
+  return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di) __W,
-              (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512(__U,
+                                            (__v8di)_mm512_rorv_epi64(__A, __B),
+                                            (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512(__U,
+                                            (__v8di)_mm512_rorv_epi64(__A, __B),
+                                            (__v8di)_mm512_setzero_si512());
 }
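
The variable-rotate family now lowers through an unmasked builtin plus a select. A usage sketch, again with an invented wrapper name:

  #include <immintrin.h>

  /* Rotate each 32-bit lane right by its own count; the hardware
     reduces counts modulo 32. */
  static __m512i rotr_lanes(__m512i v, __m512i counts) {
    return _mm512_rorv_epi32(v, counts);
  }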
 
 
 
-#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
+#define _mm512_cmp_epi32_mask(a, b, p) \
   (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
                                          (__v16si)(__m512i)(b), (int)(p), \
-                                         (__mmask16)-1); })
+                                         (__mmask16)-1)
 
-#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
+#define _mm512_cmp_epu32_mask(a, b, p) \
   (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
                                           (__v16si)(__m512i)(b), (int)(p), \
-                                          (__mmask16)-1); })
+                                          (__mmask16)-1)
 
-#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
+#define _mm512_cmp_epi64_mask(a, b, p) \
   (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
                                         (__v8di)(__m512i)(b), (int)(p), \
-                                        (__mmask8)-1); })
+                                        (__mmask8)-1)
 
-#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
+#define _mm512_cmp_epu64_mask(a, b, p) \
   (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
                                          (__v8di)(__m512i)(b), (int)(p), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
   (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
                                          (__v16si)(__m512i)(b), (int)(p), \
-                                         (__mmask16)(m)); })
+                                         (__mmask16)(m))
 
-#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
   (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
                                           (__v16si)(__m512i)(b), (int)(p), \
-                                          (__mmask16)(m)); })
+                                          (__mmask16)(m))
 
-#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
                                         (__v8di)(__m512i)(b), (int)(p), \
-                                        (__mmask8)(m)); })
+                                        (__mmask8)(m))
 
-#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
                                          (__v8di)(__m512i)(b), (int)(p), \
-                                         (__mmask8)(m)); })
+                                         (__mmask8)(m))
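
Dropping the trailing "; })" turns each comparison macro into a single parenthesized expression rather than a GNU statement expression, presumably so the macros stop depending on that extension (e.g. -Wgnu-statement-expression under -pedantic). Callers are unaffected; a sketch using the standard _MM_CMPINT_LT predicate:

  #include <immintrin.h>

  /* One mask bit per 32-bit lane; the predicate must still be a
     compile-time constant. */
  static __mmask16 lanes_less(__m512i a, __m512i b) {
    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
  }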
 
-#define _mm512_rol_epi32(a, b) __extension__ ({ \
-  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
-                                        (__v16si)_mm512_setzero_si512(), \
-                                        (__mmask16)-1); })
+#define _mm512_rol_epi32(a, b) \
+  (__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b))
 
-#define _mm512_mask_rol_epi32(W, U, a, b) __extension__ ({ \
-  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
-                                        (__v16si)(__m512i)(W), \
-                                        (__mmask16)(U)); })
+#define _mm512_mask_rol_epi32(W, U, a, b) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_rol_epi32((a), (b)), \
+                                      (__v16si)(__m512i)(W))
 
-#define _mm512_maskz_rol_epi32(U, a, b) __extension__ ({ \
-  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
-                                        (__v16si)_mm512_setzero_si512(), \
-                                        (__mmask16)(U)); })
+#define _mm512_maskz_rol_epi32(U, a, b) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_rol_epi32((a), (b)), \
+                                      (__v16si)_mm512_setzero_si512())
 
-#define _mm512_rol_epi64(a, b) __extension__ ({ \
-  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
-                                        (__v8di)_mm512_setzero_si512(), \
-                                        (__mmask8)-1); })
+#define _mm512_rol_epi64(a, b) \
+  (__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b))
 
-#define _mm512_mask_rol_epi64(W, U, a, b) __extension__ ({ \
-  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
-                                        (__v8di)(__m512i)(W), (__mmask8)(U)); })
+#define _mm512_mask_rol_epi64(W, U, a, b) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_rol_epi64((a), (b)), \
+                                      (__v8di)(__m512i)(W))
 
-#define _mm512_maskz_rol_epi64(U, a, b) __extension__ ({ \
-  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
-                                        (__v8di)_mm512_setzero_si512(), \
-                                        (__mmask8)(U)); })
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+#define _mm512_maskz_rol_epi64(U, a, b) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_rol_epi64((a), (b)), \
+                                      (__v8di)_mm512_setzero_si512())
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_rolv_epi32 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si) __W,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                           (__v16si)_mm512_rolv_epi32(__A, __B),
+                                           (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                           (__v16si)_mm512_rolv_epi32(__A, __B),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_rolv_epi64 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) -1);
+  return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di) __W,
-              (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512(__U,
+                                            (__v8di)_mm512_rolv_epi64(__A, __B),
+                                            (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
-              (__v8di) __B,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512(__U,
+                                            (__v8di)_mm512_rolv_epi64(__A, __B),
+                                            (__v8di)_mm512_setzero_si512());
 }
 
-#define _mm512_ror_epi32(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                        (__v16si)_mm512_setzero_si512(), \
-                                        (__mmask16)-1); })
+#define _mm512_ror_epi32(A, B) \
+  (__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B))
 
-#define _mm512_mask_ror_epi32(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                        (__v16si)(__m512i)(W), \
-                                        (__mmask16)(U)); })
+#define _mm512_mask_ror_epi32(W, U, A, B) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_ror_epi32((A), (B)), \
+                                      (__v16si)(__m512i)(W))
 
-#define _mm512_maskz_ror_epi32(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                        (__v16si)_mm512_setzero_si512(), \
-                                        (__mmask16)(U)); })
+#define _mm512_maskz_ror_epi32(U, A, B) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_ror_epi32((A), (B)), \
+                                      (__v16si)_mm512_setzero_si512())
 
-#define _mm512_ror_epi64(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                        (__v8di)_mm512_setzero_si512(), \
-                                        (__mmask8)-1); })
+#define _mm512_ror_epi64(A, B) \
+  (__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B))
 
-#define _mm512_mask_ror_epi64(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                        (__v8di)(__m512i)(W), (__mmask8)(U)); })
+#define _mm512_mask_ror_epi64(W, U, A, B) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_ror_epi64((A), (B)), \
+                                      (__v8di)(__m512i)(W))
 
-#define _mm512_maskz_ror_epi64(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                        (__v8di)_mm512_setzero_si512(), \
-                                        (__mmask8)(U)); })
+#define _mm512_maskz_ror_epi64(U, A, B) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_ror_epi64((A), (B)), \
+                                      (__v8di)_mm512_setzero_si512())
 
-#define _mm512_slli_epi32(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_slli_epi32(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
+}
 
-#define _mm512_mask_slli_epi32(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)(__m512i)(W), \
-                                         (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_slli_epi32(__A, __B),
+                                         (__v16si)__W);
+}
 
-#define _mm512_maskz_slli_epi32(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_slli_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512());
+}
 
-#define _mm512_slli_epi64(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_slli_epi64(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
+}
 
-#define _mm512_mask_slli_epi64(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)(__m512i)(W), \
-                                         (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_slli_epi64(__A, __B),
+                                          (__v8di)__W);
+}
 
-#define _mm512_maskz_slli_epi64(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_slli_epi64(__A, __B),
+                                          (__v8di)_mm512_setzero_si512());
+}
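
The immediate shifts change from macros to inline functions built on the unmasked shift plus a select, so merge and zero masking follow one pattern. A hypothetical usage sketch:

  #include <immintrin.h>

  /* Shift every lane left by 3, keeping dst wherever the mask bit is
     clear. */
  static __m512i shl3_merge(__m512i dst, __mmask16 m, __m512i v) {
    return _mm512_mask_slli_epi32(dst, m, v, 3);
  }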
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srli_epi32(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
+}
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_srli_epi32(__A, __B),
+                                         (__v16si)__W);
+}
 
-#define _mm512_srli_epi32(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_srli_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512());
+}
 
-#define _mm512_mask_srli_epi32(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)(__m512i)(W), \
-                                         (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srli_epi64(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
+}
 
-#define _mm512_maskz_srli_epi32(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_srli_epi64(__A, __B),
+                                          (__v8di)__W);
+}
 
-#define _mm512_srli_epi64(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_srli_epi64(__A, __B),
+                                          (__v8di)_mm512_setzero_si512());
+}
 
-#define _mm512_mask_srli_epi64(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)(__m512i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm512_maskz_srli_epi64(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)(U)); })
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
@@ -5463,7 +5188,7 @@
               (__mmask16) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
@@ -5472,14 +5197,14 @@
               (__mmask16) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
 {
   __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
           (__mmask16) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
@@ -5487,7 +5212,7 @@
                  (__v16si) __W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
@@ -5495,7 +5220,7 @@
                  (__v16si) _mm512_setzero_si512 ());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
@@ -5503,7 +5228,7 @@
                  (__v8di) __W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
@@ -5511,7 +5236,7 @@
                  (__v8di) _mm512_setzero_si512 ());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
@@ -5519,7 +5244,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
@@ -5528,21 +5253,21 @@
               (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
 {
   __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
           (__mmask8) __U);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_movedup_pd (__m512d __A)
 {
   return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
                                           0, 0, 2, 2, 4, 4, 6, 6);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
@@ -5550,7 +5275,7 @@
                                               (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
 {
   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
@@ -5558,179 +5283,179 @@
                                               (__v8df)_mm512_setzero_pd());
 }
 
-#define _mm512_fixupimm_round_pd(A, B, C, imm, R) __extension__ ({ \
+#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
   (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
-                                             (__mmask8)-1, (int)(R)); })
+                                             (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) __extension__ ({ \
+#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
   (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
-#define _mm512_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+#define _mm512_fixupimm_pd(A, B, C, imm) \
   (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)-1, \
-                                             _MM_FROUND_CUR_DIRECTION); })
+                                             _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
   (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)(U), \
-                                             _MM_FROUND_CUR_DIRECTION); })
+                                             _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) __extension__ ({ \
+#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
   (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8di)(__m512i)(C), \
                                               (int)(imm), (__mmask8)(U), \
-                                              (int)(R)); })
+                                              (int)(R))
 
-#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
   (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8di)(__m512i)(C), \
                                               (int)(imm), (__mmask8)(U), \
-                                              _MM_FROUND_CUR_DIRECTION); })
+                                              _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_fixupimm_round_ps(A, B, C, imm, R) __extension__ ({ \
+#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
   (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
-                                            (__mmask16)-1, (int)(R)); })
+                                            (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) __extension__ ({ \
+#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
   (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
-                                            (__mmask16)(U), (int)(R)); })
+                                            (__mmask16)(U), (int)(R))
 
-#define _mm512_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+#define _mm512_fixupimm_ps(A, B, C, imm) \
   (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)-1, \
-                                            _MM_FROUND_CUR_DIRECTION); })
+                                            _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
   (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)(U), \
-                                            _MM_FROUND_CUR_DIRECTION); })
+                                            _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) __extension__ ({ \
+#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
   (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16si)(__m512i)(C), \
                                              (int)(imm), (__mmask16)(U), \
-                                             (int)(R)); })
+                                             (int)(R))
 
-#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
   (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16si)(__m512i)(C), \
                                              (int)(imm), (__mmask16)(U), \
-                                             _MM_FROUND_CUR_DIRECTION); })
+                                             _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_fixupimm_round_sd(A, B, C, imm, R) __extension__ ({ \
+#define _mm_fixupimm_round_sd(A, B, C, imm, R) \
   (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
-                                          (__mmask8)-1, (int)(R)); })
+                                          (__mmask8)-1, (int)(R))
 
-#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) __extension__ ({ \
+#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
   (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-#define _mm_fixupimm_sd(A, B, C, imm) __extension__ ({ \
+#define _mm_fixupimm_sd(A, B, C, imm) \
   (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)-1, \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_fixupimm_sd(A, U, B, C, imm) __extension__ ({ \
+#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
   (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) __extension__ ({ \
+#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
   (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2di)(__m128i)(C), (int)(imm), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) __extension__ ({ \
+#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
   (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2di)(__m128i)(C), (int)(imm), \
                                            (__mmask8)(U), \
-                                           _MM_FROUND_CUR_DIRECTION); })
+                                           _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_fixupimm_round_ss(A, B, C, imm, R) __extension__ ({ \
+#define _mm_fixupimm_round_ss(A, B, C, imm, R) \
   (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
-                                         (__mmask8)-1, (int)(R)); })
+                                         (__mmask8)-1, (int)(R))
 
-#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) __extension__ ({ \
+#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
   (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
-                                         (__mmask8)(U), (int)(R)); })
+                                         (__mmask8)(U), (int)(R))
 
-#define _mm_fixupimm_ss(A, B, C, imm) __extension__ ({ \
+#define _mm_fixupimm_ss(A, B, C, imm) \
   (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)-1, \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_fixupimm_ss(A, U, B, C, imm) __extension__ ({ \
+#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
   (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)(U), \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) __extension__ ({ \
+#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
   (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4si)(__m128i)(C), (int)(imm), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) __extension__ ({ \
+#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
   (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4si)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_getexp_round_sd(A, B, R) __extension__ ({ \
+#define _mm_getexp_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                  (__v2df)(__m128d)(B), \
                                                  (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8)-1, (int)(R)); })
+                                                 (__mmask8)-1, (int)(R))
 
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_getexp_sd (__m128d __A, __m128d __B)
 {
   return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
                  (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
 }
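
_mm_getexp_sd extracts the unbiased exponent of the low lane, i.e. floor(log2(|b0|)), and passes the upper lane through from the first operand. A small sketch (exp_of_low is an invented name):

  #include <immintrin.h>

  /* Low lane becomes floor(log2(|b0|)) as a double (8.0 -> 3.0); the
     high lane is copied from a. */
  static __m128d exp_of_low(__m128d a, __m128d b) {
    return _mm_getexp_sd(a, b);
  }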
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
@@ -5740,13 +5465,13 @@
           _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_getexp_round_sd(W, U, A, B, R) __extension__ ({\
+#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                  (__v2df)(__m128d)(B), \
                                                  (__v2df)(__m128d)(W), \
-                                                 (__mmask8)(U), (int)(R)); })
+                                                 (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
@@ -5756,26 +5481,26 @@
           _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_getexp_round_sd(U, A, B, R) __extension__ ({\
+#define _mm_maskz_getexp_round_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                  (__v2df)(__m128d)(B), \
                                                  (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8)(U), (int)(R)); })
+                                                 (__mmask8)(U), (int)(R))
 
-#define _mm_getexp_round_ss(A, B, R) __extension__ ({ \
+#define _mm_getexp_round_ss(A, B, R) \
   (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                 (__v4sf)(__m128)(B), \
                                                 (__v4sf)_mm_setzero_ps(), \
-                                                (__mmask8)-1, (int)(R)); })
+                                                (__mmask8)-1, (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_getexp_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
                 (__v4sf) __B, (__v4sf)  _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
@@ -5785,942 +5510,765 @@
           _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_getexp_round_ss(W, U, A, B, R) __extension__ ({\
+#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                 (__v4sf)(__m128)(B), \
                                                 (__v4sf)(__m128)(W), \
-                                                (__mmask8)(U), (int)(R)); })
+                                                (__mmask8)(U), (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
           (__v4sf) __B,
-          (__v4sf) _mm_setzero_pd (),
+          (__v4sf) _mm_setzero_ps (),
           (__mmask8) __U,
           _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_getexp_round_ss(U, A, B, R) __extension__ ({\
+#define _mm_maskz_getexp_round_ss(U, A, B, R) \
   (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                 (__v4sf)(__m128)(B), \
                                                 (__v4sf)_mm_setzero_ps(), \
-                                                (__mmask8)(U), (int)(R)); })
+                                                (__mmask8)(U), (int)(R))
 
-#define _mm_getmant_round_sd(A, B, C, D, R) __extension__ ({ \
+#define _mm_getmant_round_sd(A, B, C, D, R) \
   (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)-1, (int)(R)); })
+                                               (__mmask8)-1, (int)(R))
 
-#define _mm_getmant_sd(A, B, C, D)  __extension__ ({ \
+#define _mm_getmant_sd(A, B, C, D)  \
   (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)-1, \
-                                               _MM_FROUND_CUR_DIRECTION); })
+                                               _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_getmant_sd(W, U, A, B, C, D) __extension__ ({\
+#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
   (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)(__m128d)(W), \
                                                (__mmask8)(U), \
-                                               _MM_FROUND_CUR_DIRECTION); })
+                                               _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R)({\
+#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
   (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)(__m128d)(W), \
-                                               (__mmask8)(U), (int)(R)); })
+                                               (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_getmant_sd(U, A, B, C, D) __extension__ ({\
+#define _mm_maskz_getmant_sd(U, A, B, C, D) \
   (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)(U), \
-                                               _MM_FROUND_CUR_DIRECTION); })
+                                               _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) __extension__ ({\
+#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
   (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)(U), (int)(R)); })
+                                               (__mmask8)(U), (int)(R))
 
-#define _mm_getmant_round_ss(A, B, C, D, R) __extension__ ({ \
+#define _mm_getmant_round_ss(A, B, C, D, R) \
   (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)-1, (int)(R)); })
+                                              (__mmask8)-1, (int)(R))
 
-#define _mm_getmant_ss(A, B, C, D) __extension__ ({ \
+#define _mm_getmant_ss(A, B, C, D) \
   (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)-1, \
-                                              _MM_FROUND_CUR_DIRECTION); })
+                                              _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_getmant_ss(W, U, A, B, C, D) __extension__ ({\
+#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
   (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)(__m128)(W), \
                                               (__mmask8)(U), \
-                                              _MM_FROUND_CUR_DIRECTION); })
+                                              _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R)({\
+#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
   (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)(__m128)(W), \
-                                              (__mmask8)(U), (int)(R)); })
+                                              (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_getmant_ss(U, A, B, C, D) __extension__ ({\
-  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
-                                              (__v4sf)(__m128)(B), \
-                                              (int)(((D)<<2) | (C)), \
-                                              (__v4sf)_mm_setzero_pd(), \
-                                              (__mmask8)(U), \
-                                              _MM_FROUND_CUR_DIRECTION); })
-
-#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) __extension__ ({\
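+/* vgetmant: (C) selects the normalization interval and (D) the sign
+   control; they are packed into the immediate as ((D) << 2) | (C). */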
+#define _mm_maskz_getmant_ss(U, A, B, C, D) \
   (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)(U), (int)(R)); })
+                                              (__mmask8)(U), \
+                                              _MM_FROUND_CUR_DIRECTION)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)(U), (int)(R))
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
 _mm512_kmov (__mmask16 __A)
 {
   return  __A;
 }
 
-#define _mm_comi_round_sd(A, B, P, R) __extension__ ({\
+#define _mm_comi_round_sd(A, B, P, R) \
   (int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
-                              (int)(P), (int)(R)); })
+                              (int)(P), (int)(R))
 
-#define _mm_comi_round_ss(A, B, P, R) __extension__ ({\
+#define _mm_comi_round_ss(A, B, P, R) \
   (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
-                              (int)(P), (int)(R)); })
+                              (int)(P), (int)(R))
 
-#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \
-  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
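+/* Conversions with a 64-bit integer result are only available when
+   targeting x86_64. */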
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_si64(A, R) \
+  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))
+#endif
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
-         __mmask16 __U, __m512i __B)
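+/* The masked shift variants below pair the unmasked shift with a
+   per-element select: lanes whose mask bit is set take the shifted value,
+   the rest take the passthrough operand (mask) or zero (maskz). */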
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sll_epi32(__m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A,
-                   (__v16si) __I
-                   /* idx */ ,
-                   (__v16si) __B,
-                   (__mmask16) __U);
+  return (__m512i)__builtin_ia32_pslld512((__v16si)__A, (__v4si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi32 (__m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
-             (__v4si) __B,
-             (__v16si)
-             _mm512_setzero_si512 (),
-             (__mmask16) -1);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sll_epi32(__A, __B),
+                                          (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
-             (__v4si) __B,
-             (__v16si) __W,
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sll_epi32(__A, __B),
+                                          (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sll_epi64(__m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
-             (__v4si) __B,
-             (__v16si)
-             _mm512_setzero_si512 (),
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi64 (__m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
-             (__v2di) __B,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) -1);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_sll_epi64(__A, __B),
+                                             (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
-             (__v2di) __B,
-             (__v8di) __W,
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_sll_epi64(__A, __B),
+                                           (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sllv_epi32(__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
-             (__v2di) __B,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
+                                           (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v16si) __W,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sllv_epi64(__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
-             (__v8di) __Y,
-             (__v8di)
-             _mm512_undefined_pd (),
-             (__mmask8) -1);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
+                                            (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
-             (__v8di) __Y,
-             (__v8di) __W,
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
+                                            (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sra_epi32(__m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
-             (__v8di) __Y,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_psrad512((__v16si)__A, (__v4si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi32 (__m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
-             (__v4si) __B,
-             (__v16si)
-             _mm512_setzero_si512 (),
-             (__mmask16) -1);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sra_epi32(__A, __B),
+                                          (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
-             (__v4si) __B,
-             (__v16si) __W,
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sra_epi32(__A, __B),
+                                          (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sra_epi64(__m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
-             (__v4si) __B,
-             (__v16si)
-             _mm512_setzero_si512 (),
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi64 (__m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
-             (__v2di) __B,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) -1);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_sra_epi64(__A, __B),
+                                           (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
-             (__v2di) __B,
-             (__v8di) __W,
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_sra_epi64(__A, __B),
+                                           (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srav_epi32(__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
-             (__v2di) __B,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi32 (__m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srav_epi32(__X, __Y),
+                                           (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v16si) __W,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srav_epi32(__X, __Y),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srav_epi64(__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi64 (__m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
-             (__v8di) __Y,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) -1);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srav_epi64(__X, __Y),
+                                            (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
-             (__v8di) __Y,
-             (__v8di) __W,
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srav_epi64(__X, __Y),
+                                            (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srl_epi32(__m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
-             (__v8di) __Y,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_psrld512((__v16si)__A, (__v4si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi32 (__m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
-             (__v4si) __B,
-             (__v16si)
-             _mm512_setzero_si512 (),
-             (__mmask16) -1);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_srl_epi32(__A, __B),
+                                          (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
-             (__v4si) __B,
-             (__v16si) __W,
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_srl_epi32(__A, __B),
+                                          (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srl_epi64(__m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
-             (__v4si) __B,
-             (__v16si)
-             _mm512_setzero_si512 (),
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi64 (__m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
-             (__v2di) __B,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) -1);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_srl_epi64(__A, __B),
+                                           (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
-             (__v2di) __B,
-             (__v8di) __W,
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_srl_epi64(__A, __B),
+                                           (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srlv_epi32(__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
-             (__v2di) __B,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
+                                           (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v16si) __W,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
-              (__v16si) __Y,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srlv_epi64 (__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
-             (__v8di) __Y,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) -1);
+  return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
-             (__v8di) __Y,
-             (__v8di) __W,
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
+                                            (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
-             (__v8di) __Y,
-             (__v8di)
-             _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
+                                            (__v8di)_mm512_setzero_si512());
 }
 
-#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
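+/* vpternlog: (imm) is an 8-bit truth table applied bitwise across the
+   three integer sources. */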
+#define _mm512_ternarylogic_epi32(A, B, C, imm) \
   (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
                                             (__v16si)(__m512i)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
-                                            (__mmask16)-1); })
+                                            (__mmask16)-1)
 
-#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
   (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
                                             (__v16si)(__m512i)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
-                                            (__mmask16)(U)); })
+                                            (__mmask16)(U))
 
-#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
   (__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
                                              (__v16si)(__m512i)(B), \
                                              (__v16si)(__m512i)(C), \
-                                             (int)(imm), (__mmask16)(U)); })
+                                             (int)(imm), (__mmask16)(U))
 
-#define _mm512_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+#define _mm512_ternarylogic_epi64(A, B, C, imm) \
   (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
                                             (__v8di)(__m512i)(B), \
                                             (__v8di)(__m512i)(C), (int)(imm), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
   (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
                                             (__v8di)(__m512i)(B), \
                                             (__v8di)(__m512i)(C), (int)(imm), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
   (__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
                                              (__v8di)(__m512i)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \
-  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
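+/* (R) supplies the rounding control for the conversion;
+   _MM_FROUND_CUR_DIRECTION selects the current MXCSR rounding mode. */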
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_i64(A, R) \
+  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))
+#endif
 
-#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \
-  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+#define _mm_cvt_roundsd_si32(A, R) \
+  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))
 
-#define _mm_cvt_roundsd_i32(A, R) __extension__ ({ \
-  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+#define _mm_cvt_roundsd_i32(A, R) \
+  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))
 
-#define _mm_cvt_roundsd_u32(A, R) __extension__ ({ \
-  (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
+#define _mm_cvt_roundsd_u32(A, R) \
+  (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R))
 
-static __inline__ unsigned __DEFAULT_FN_ATTRS
+static __inline__ unsigned __DEFAULT_FN_ATTRS128
 _mm_cvtsd_u32 (__m128d __A)
 {
   return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
              _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_u64(A, R) \
   (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
-                                                  (int)(R)); })
+                                                  (int)(R))
 
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
 _mm_cvtsd_u64 (__m128d __A)
 {
   return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
                  __A,
                  _MM_FROUND_CUR_DIRECTION);
 }
+#endif
 
-#define _mm_cvt_roundss_si32(A, R) __extension__ ({ \
-  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+#define _mm_cvt_roundss_si32(A, R) \
+  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))
 
-#define _mm_cvt_roundss_i32(A, R) __extension__ ({ \
-  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+#define _mm_cvt_roundss_i32(A, R) \
+  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))
 
-#define _mm_cvt_roundss_si64(A, R) __extension__ ({ \
-  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#ifdef __x86_64__
+#define _mm_cvt_roundss_si64(A, R) \
+  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))
 
-#define _mm_cvt_roundss_i64(A, R) __extension__ ({ \
-  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#define _mm_cvt_roundss_i64(A, R) \
+  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))
+#endif
 
-#define _mm_cvt_roundss_u32(A, R) __extension__ ({ \
-  (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); })
+#define _mm_cvt_roundss_u32(A, R) \
+  (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R))
 
-static __inline__ unsigned __DEFAULT_FN_ATTRS
+static __inline__ unsigned __DEFAULT_FN_ATTRS128
 _mm_cvtss_u32 (__m128 __A)
 {
   return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
              _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_cvt_roundss_u64(A, R) __extension__ ({ \
+#ifdef __x86_64__
+#define _mm_cvt_roundss_u64(A, R) \
   (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
-                                                  (int)(R)); })
+                                                  (int)(R))
 
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
 _mm_cvtss_u64 (__m128 __A)
 {
   return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
                  __A,
                  _MM_FROUND_CUR_DIRECTION);
 }
+#endif
 
-#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \
-  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+#define _mm_cvtt_roundsd_i32(A, R) \
+  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))
 
-#define _mm_cvtt_roundsd_si32(A, R) __extension__ ({ \
-  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+#define _mm_cvtt_roundsd_si32(A, R) \
+  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))
 
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS128
 _mm_cvttsd_i32 (__m128d __A)
 {
   return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
               _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \
-  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_si64(A, R) \
+  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))
 
-#define _mm_cvtt_roundsd_i64(A, R) __extension__ ({ \
-  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#define _mm_cvtt_roundsd_i64(A, R) \
+  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))
 
-static __inline__ long long __DEFAULT_FN_ATTRS
+static __inline__ long long __DEFAULT_FN_ATTRS128
 _mm_cvttsd_i64 (__m128d __A)
 {
   return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
               _MM_FROUND_CUR_DIRECTION);
 }
+#endif
 
-#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \
-  (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
+#define _mm_cvtt_roundsd_u32(A, R) \
+  (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R))
 
-static __inline__ unsigned __DEFAULT_FN_ATTRS
+static __inline__ unsigned __DEFAULT_FN_ATTRS128
 _mm_cvttsd_u32 (__m128d __A)
 {
   return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
               _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_u64(A, R) \
   (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
-                                                   (int)(R)); })
+                                                   (int)(R))
 
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
 _mm_cvttsd_u64 (__m128d __A)
 {
   return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
                   __A,
                   _MM_FROUND_CUR_DIRECTION);
 }
+#endif
 
-#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \
-  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
+#define _mm_cvtt_roundss_i32(A, R) \
+  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))
 
-#define _mm_cvtt_roundss_si32(A, R) __extension__ ({ \
-  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
+#define _mm_cvtt_roundss_si32(A, R) \
+  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))
 
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS128
 _mm_cvttss_i32 (__m128 __A)
 {
   return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
               _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \
-  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_i64(A, R) \
+  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))
 
-#define _mm_cvtt_roundss_si64(A, R) __extension__ ({ \
-  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#define _mm_cvtt_roundss_si64(A, R) \
+  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))
 
-static __inline__ long long __DEFAULT_FN_ATTRS
+static __inline__ long long __DEFAULT_FN_ATTRS128
 _mm_cvttss_i64 (__m128 __A)
 {
   return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
               _MM_FROUND_CUR_DIRECTION);
 }
+#endif
 
-#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \
-  (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); })
+#define _mm_cvtt_roundss_u32(A, R) \
+  (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R))
 
-static __inline__ unsigned __DEFAULT_FN_ATTRS
+static __inline__ unsigned __DEFAULT_FN_ATTRS128
 _mm_cvttss_u32 (__m128 __A)
 {
   return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
               _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_u64(A, R) \
   (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
-                                                   (int)(R)); })
+                                                   (int)(R))
 
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
 _mm_cvttss_u64 (__m128 __A)
 {
   return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
                   __A,
                   _MM_FROUND_CUR_DIRECTION);
 }
+#endif
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
-            __m512d __B)
-{
-  return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A,
-              (__v8di) __I
-              /* idx */ ,
-              (__v8df) __B,
-              (__mmask8) __U);
-}
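+/* Each bit of the immediate (C) selects one of the two doubles within its
+   128-bit lane (vpermilpd). */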
+#define _mm512_permute_pd(X, C) \
+  (__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C))
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
-            __m512 __B)
-{
-  return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A,
-                   (__v16si) __I
-                   /* idx */ ,
-                   (__v16sf) __B,
-                   (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
-         __mmask8 __U, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A,
-                   (__v8di) __I
-                   /* idx */ ,
-                   (__v8di) __B,
-                   (__mmask8) __U);
-}
-
-#define _mm512_permute_pd(X, C) __extension__ ({ \
-  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
-                                   (__v8df)_mm512_undefined_pd(), \
-                                   0 + (((C) >> 0) & 0x1), \
-                                   0 + (((C) >> 1) & 0x1), \
-                                   2 + (((C) >> 2) & 0x1), \
-                                   2 + (((C) >> 3) & 0x1), \
-                                   4 + (((C) >> 4) & 0x1), \
-                                   4 + (((C) >> 5) & 0x1), \
-                                   6 + (((C) >> 6) & 0x1), \
-                                   6 + (((C) >> 7) & 0x1)); })
-
-#define _mm512_mask_permute_pd(W, U, X, C) __extension__ ({ \
+#define _mm512_mask_permute_pd(W, U, X, C) \
   (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permute_pd((X), (C)), \
-                                       (__v8df)(__m512d)(W)); })
+                                       (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_permute_pd(U, X, C) __extension__ ({ \
+#define _mm512_maskz_permute_pd(U, X, C) \
   (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permute_pd((X), (C)), \
-                                       (__v8df)_mm512_setzero_pd()); })
+                                       (__v8df)_mm512_setzero_pd())
 
-#define _mm512_permute_ps(X, C) __extension__ ({ \
-  (__m512)__builtin_shufflevector((__v16sf)(__m512)(X), \
-                                  (__v16sf)_mm512_undefined_ps(), \
-                                   0  + (((C) >> 0) & 0x3), \
-                                   0  + (((C) >> 2) & 0x3), \
-                                   0  + (((C) >> 4) & 0x3), \
-                                   0  + (((C) >> 6) & 0x3), \
-                                   4  + (((C) >> 0) & 0x3), \
-                                   4  + (((C) >> 2) & 0x3), \
-                                   4  + (((C) >> 4) & 0x3), \
-                                   4  + (((C) >> 6) & 0x3), \
-                                   8  + (((C) >> 0) & 0x3), \
-                                   8  + (((C) >> 2) & 0x3), \
-                                   8  + (((C) >> 4) & 0x3), \
-                                   8  + (((C) >> 6) & 0x3), \
-                                   12 + (((C) >> 0) & 0x3), \
-                                   12 + (((C) >> 2) & 0x3), \
-                                   12 + (((C) >> 4) & 0x3), \
-                                   12 + (((C) >> 6) & 0x3)); })
+#define _mm512_permute_ps(X, C) \
+  (__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C))
 
-#define _mm512_mask_permute_ps(W, U, X, C) __extension__ ({ \
+#define _mm512_mask_permute_ps(W, U, X, C) \
   (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_permute_ps((X), (C)), \
-                                      (__v16sf)(__m512)(W)); })
+                                      (__v16sf)(__m512)(W))
 
-#define _mm512_maskz_permute_ps(U, X, C) __extension__ ({ \
+#define _mm512_maskz_permute_ps(U, X, C) \
   (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_permute_ps((X), (C)), \
-                                      (__v16sf)_mm512_setzero_ps()); })
+                                      (__v16sf)_mm512_setzero_ps())
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_permutevar_pd (__m512d __A, __m512i __C)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_permutevar_pd(__m512d __A, __m512i __C)
 {
-  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
-              (__v8di) __C,
-              (__v8df)
-              _mm512_undefined_pd (),
-              (__mmask8) -1);
+  return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
 {
-  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
-              (__v8di) __C,
-              (__v8df) __W,
-              (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                         (__v8df)_mm512_permutevar_pd(__A, __C),
+                                         (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
 {
-  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
-              (__v8di) __C,
-              (__v8df)
-              _mm512_setzero_pd (),
-              (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                         (__v8df)_mm512_permutevar_pd(__A, __C),
+                                         (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_permutevar_ps (__m512 __A, __m512i __C)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_permutevar_ps(__m512 __A, __m512i __C)
 {
-  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
-                   (__v16si) __C,
-                   (__v16sf)
-                   _mm512_undefined_ps (),
-                   (__mmask16) -1);
+  return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
 {
-  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
-                   (__v16si) __C,
-                   (__v16sf) __W,
-                   (__mmask16) __U);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
+                                        (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
 {
-  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
-                   (__v16si) __C,
-                   (__v16sf)
-                   _mm512_setzero_ps (),
-                   (__mmask16) __U);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
+                                        (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS
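+/* The two-source permutes are built on the unmasked vpermi2var builtins;
+   merge masking falls back to the first source, and the mask2 variants
+   fall back to the index operand, via a following select. */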
+static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
-                    /* idx */ ,
-                    (__v8df) __A,
-                    (__v8df) __B,
-                    (__mmask8) -1);
+  return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
+                                                 (__v8df)__B);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
-                    /* idx */ ,
-                    (__v8df) __A,
-                    (__v8df) __B,
-                    (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512(__U,
+                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
+                                  (__v8df)__A);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
-            __m512d __B)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
+                             __m512d __B)
 {
-  return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I
-                                                         /* idx */ ,
-                                                         (__v8df) __A,
-                                                         (__v8df) __B,
-                                                         (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512(__U,
+                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
+                                  (__v8df)(__m512d)__I);
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
+                             __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512(__U,
+                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
+                                  (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
 {
-  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
-                                                         /* idx */ ,
-                                                         (__v16sf) __A,
-                                                         (__v16sf) __B,
-                                                         (__mmask16) -1);
+  return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
+                                                (__v16sf)__B);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
 {
-  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
-                                                         /* idx */ ,
-                                                         (__v16sf) __A,
-                                                         (__v16sf) __B,
-                                                         (__mmask16) __U);
+  return (__m512)__builtin_ia32_selectps_512(__U,
+                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
+                                 (__v16sf)__A);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
-            __m512 __B)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
 {
-  return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I
-                                                        /* idx */ ,
-                                                        (__v16sf) __A,
-                                                        (__v16sf) __B,
-                                                        (__mmask16) __U);
+  return (__m512)__builtin_ia32_selectps_512(__U,
+                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
+                                 (__v16sf)(__m512)__I);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
 {
-  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
-             (__v16si) __B,
-             (__mmask16) -1);
+  return (__m512)__builtin_ia32_selectps_512(__U,
+                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
+                                 (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
-             (__v16si) __B, __U);
-}
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
-{
-  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
-            (__v8di) __B,
-            (__mmask8) -1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
-            (__v8di) __B, __U);
-}
-
-#define _mm512_cvtt_roundpd_epu32(A, R) __extension__ ({ \
+#define _mm512_cvtt_roundpd_epu32(A, R) \
   (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)_mm256_undefined_si256(), \
-                                             (__mmask8)-1, (int)(R)); })
+                                             (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
   (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)(__m256i)(W), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
   (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)_mm256_setzero_si256(), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvttpd_epu32 (__m512d __A)
 {
   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
@@ -6730,7 +6278,7 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
 {
   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
@@ -6739,7 +6287,7 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
 {
   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
@@ -6749,109 +6297,109 @@
                   _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_roundscale_round_sd(A, B, imm, R) __extension__ ({ \
+#define _mm_roundscale_round_sd(A, B, imm, R) \
   (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)-1, (int)(imm), \
-                                                (int)(R)); })
+                                                (int)(R))
 
-#define _mm_roundscale_sd(A, B, imm) __extension__ ({ \
+#define _mm_roundscale_sd(A, B, imm) \
   (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)-1, (int)(imm), \
-                                                _MM_FROUND_CUR_DIRECTION); })
+                                                _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_roundscale_sd(W, U, A, B, imm) __extension__ ({ \
+#define _mm_mask_roundscale_sd(W, U, A, B, imm) \
   (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U), (int)(imm), \
-                                                _MM_FROUND_CUR_DIRECTION); })
+                                                _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) __extension__ ({ \
+#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
   (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U), (int)(I), \
-                                                (int)(R)); })
+                                                (int)(R))
 
-#define _mm_maskz_roundscale_sd(U, A, B, I) __extension__ ({ \
+#define _mm_maskz_roundscale_sd(U, A, B, I) \
   (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(I), \
-                                                _MM_FROUND_CUR_DIRECTION); })
+                                                _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) __extension__ ({ \
+#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
   (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(I), \
-                                                (int)(R)); })
+                                                (int)(R))
 
-#define _mm_roundscale_round_ss(A, B, imm, R) __extension__ ({ \
+#define _mm_roundscale_round_ss(A, B, imm, R) \
   (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)-1, (int)(imm), \
-                                               (int)(R)); })
+                                               (int)(R))
 
-#define _mm_roundscale_ss(A, B, imm) __extension__ ({ \
+#define _mm_roundscale_ss(A, B, imm) \
   (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)-1, (int)(imm), \
-                                               _MM_FROUND_CUR_DIRECTION); })
+                                               _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_roundscale_ss(W, U, A, B, I) __extension__ ({ \
+#define _mm_mask_roundscale_ss(W, U, A, B, I) \
   (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W), \
                                                (__mmask8)(U), (int)(I), \
-                                               _MM_FROUND_CUR_DIRECTION); })
+                                               _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) __extension__ ({ \
+#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
   (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W), \
                                                (__mmask8)(U), (int)(I), \
-                                               (int)(R)); })
+                                               (int)(R))
 
-#define _mm_maskz_roundscale_ss(U, A, B, I) __extension__ ({ \
+#define _mm_maskz_roundscale_ss(U, A, B, I) \
   (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(I), \
-                                               _MM_FROUND_CUR_DIRECTION); })
+                                               _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) __extension__ ({ \
+#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
   (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(I), \
-                                               (int)(R)); })
+                                               (int)(R))
 
-#define _mm512_scalef_round_pd(A, B, R) __extension__ ({ \
+#define _mm512_scalef_round_pd(A, B, R) \
   (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)_mm512_undefined_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_scalef_round_pd(W, U, A, B, R) __extension__ ({ \
+#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
   (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_scalef_round_pd(U, A, B, R) __extension__ ({ \
+#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
   (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_scalef_pd (__m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
@@ -6862,7 +6410,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
@@ -6872,7 +6420,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
@@ -6883,25 +6431,25 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_scalef_round_ps(A, B, R) __extension__ ({ \
+#define _mm512_scalef_round_ps(A, B, R) \
   (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)_mm512_undefined_ps(), \
-                                          (__mmask16)-1, (int)(R)); })
+                                          (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_scalef_round_ps(W, U, A, B, R) __extension__ ({ \
+#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
   (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(W), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_scalef_round_ps(U, A, B, R) __extension__ ({ \
+#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
   (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_scalef_ps (__m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
@@ -6912,7 +6460,7 @@
                _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
@@ -6922,7 +6470,7 @@
                _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
@@ -6933,13 +6481,13 @@
                _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_scalef_round_sd(A, B, R) __extension__ ({ \
+#define _mm_scalef_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
-                                              (__mmask8)-1, (int)(R)); })
+                                              (__mmask8)-1, (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_scalef_sd (__m128d __A, __m128d __B)
 {
   return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
@@ -6948,7 +6496,7 @@
               _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
@@ -6958,13 +6506,13 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_scalef_round_sd(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)(__m128d)(W), \
-                                              (__mmask8)(U), (int)(R)); })
+                                              (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
@@ -6974,19 +6522,19 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_scalef_round_sd(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_scalef_round_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
-                                              (__mmask8)(U), (int)(R)); })
+                                              (__mmask8)(U), (int)(R))
 
-#define _mm_scalef_round_ss(A, B, R) __extension__ ({ \
+#define _mm_scalef_round_ss(A, B, R) \
   (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
-                                             (__mmask8)-1, (int)(R)); })
+                                             (__mmask8)-1, (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_scalef_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
@@ -6995,7 +6543,7 @@
              _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
@@ -7005,13 +6553,13 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_scalef_round_ss(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)(__m128)(W), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
@@ -7021,174 +6569,147 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_scalef_round_ss(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_scalef_round_ss(U, A, B, R) \
   (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(U), \
-                                             _MM_FROUND_CUR_DIRECTION); })
+                                             (int)(R))
 
-#define _mm512_srai_epi32(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srai_epi32(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
+}
 
-#define _mm512_mask_srai_epi32(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)(__m512i)(W), \
-                                         (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_srai_epi32(__A, __B),
+                                         (__v16si)__W);
+}
 
-#define _mm512_maskz_srai_epi32(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) {
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_srai_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512());
+}
 
-#define _mm512_srai_epi64(A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srai_epi64(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
+}
 
-#define _mm512_mask_srai_epi64(W, U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)(__m512i)(W), \
-                                         (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_srai_epi64(__A, __B),
+                                          (__v8di)__W);
+}
 
-#define _mm512_maskz_srai_epi64(U, A, B) __extension__ ({ \
-  (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_srai_epi64(__A, __B),
+                                          (__v8di)_mm512_setzero_si512());
+}
 
-#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(B), (int)(imm), \
-                                         (__v16sf)_mm512_undefined_ps(), \
-                                         (__mmask16)-1); })
+#define _mm512_shuffle_f32x4(A, B, imm) \
+  (__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
+                                    (__v16sf)(__m512)(B), (int)(imm))
 
-#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(B), (int)(imm), \
-                                         (__v16sf)(__m512)(W), \
-                                         (__mmask16)(U)); })
+#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
+                                      (__v16sf)(__m512)(W))
 
-#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(B), (int)(imm), \
-                                         (__v16sf)_mm512_setzero_ps(), \
-                                         (__mmask16)(U)); })
+#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
+                                      (__v16sf)_mm512_setzero_ps())
 
-#define _mm512_shuffle_f64x2(A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), (int)(imm), \
-                                          (__v8df)_mm512_undefined_pd(), \
-                                          (__mmask8)-1); })
+#define _mm512_shuffle_f64x2(A, B, imm) \
+  (__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
+                                     (__v8df)(__m512d)(B), (int)(imm))
 
-#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), (int)(imm), \
-                                          (__v8df)(__m512d)(W), \
-                                          (__mmask8)(U)); })
+#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
+                                       (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), (int)(imm), \
-                                          (__v8df)_mm512_setzero_pd(), \
-                                          (__mmask8)(U)); })
+#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
+                                       (__v8df)_mm512_setzero_pd())
 
-#define _mm512_shuffle_i32x4(A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
-                                          (__v16si)(__m512i)(B), (int)(imm), \
-                                          (__v16si)_mm512_setzero_si512(), \
-                                          (__mmask16)-1); })
+#define _mm512_shuffle_i32x4(A, B, imm) \
+  (__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
+                                     (__v16si)(__m512i)(B), (int)(imm))
 
-#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
-                                          (__v16si)(__m512i)(B), (int)(imm), \
-                                          (__v16si)(__m512i)(W), \
-                                          (__mmask16)(U)); })
+#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
+                                      (__v16si)(__m512i)(W))
 
-#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
-                                          (__v16si)(__m512i)(B), (int)(imm), \
-                                          (__v16si)_mm512_setzero_si512(), \
-                                          (__mmask16)(U)); })
+#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
+                                      (__v16si)_mm512_setzero_si512())
 
-#define _mm512_shuffle_i64x2(A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
-                                          (__v8di)(__m512i)(B), (int)(imm), \
-                                          (__v8di)_mm512_setzero_si512(), \
-                                          (__mmask8)-1); })
+#define _mm512_shuffle_i64x2(A, B, imm) \
+  (__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
+                                     (__v8di)(__m512i)(B), (int)(imm))
 
-#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
-                                          (__v8di)(__m512i)(B), (int)(imm), \
-                                          (__v8di)(__m512i)(W), \
-                                          (__mmask8)(U)); })
+#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
+                                      (__v8di)(__m512i)(W))
 
-#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
-                                          (__v8di)(__m512i)(B), (int)(imm), \
-                                          (__v8di)_mm512_setzero_si512(), \
-                                          (__mmask8)(U)); })
+#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
+                                      (__v8di)_mm512_setzero_si512())
 
-#define _mm512_shuffle_pd(A, B, M) __extension__ ({ \
-  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
-                                   (__v8df)(__m512d)(B), \
-                                   0  + (((M) >> 0) & 0x1), \
-                                   8  + (((M) >> 1) & 0x1), \
-                                   2  + (((M) >> 2) & 0x1), \
-                                   10 + (((M) >> 3) & 0x1), \
-                                   4  + (((M) >> 4) & 0x1), \
-                                   12 + (((M) >> 5) & 0x1), \
-                                   6  + (((M) >> 6) & 0x1), \
-                                   14 + (((M) >> 7) & 0x1)); })
+#define _mm512_shuffle_pd(A, B, M) \
+  (__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \
+                                    (__v8df)(__m512d)(B), (int)(M))
 
-#define _mm512_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
   (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
-                                       (__v8df)(__m512d)(W)); })
+                                       (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+#define _mm512_maskz_shuffle_pd(U, A, B, M) \
   (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
-                                       (__v8df)_mm512_setzero_pd()); })
+                                       (__v8df)_mm512_setzero_pd())
 
-#define _mm512_shuffle_ps(A, B, M) __extension__ ({ \
-  (__m512d)__builtin_shufflevector((__v16sf)(__m512)(A), \
-                                   (__v16sf)(__m512)(B), \
-                                   0  + (((M) >> 0) & 0x3), \
-                                   0  + (((M) >> 2) & 0x3), \
-                                   16 + (((M) >> 4) & 0x3), \
-                                   16 + (((M) >> 6) & 0x3), \
-                                   4  + (((M) >> 0) & 0x3), \
-                                   4  + (((M) >> 2) & 0x3), \
-                                   20 + (((M) >> 4) & 0x3), \
-                                   20 + (((M) >> 6) & 0x3), \
-                                   8  + (((M) >> 0) & 0x3), \
-                                   8  + (((M) >> 2) & 0x3), \
-                                   24 + (((M) >> 4) & 0x3), \
-                                   24 + (((M) >> 6) & 0x3), \
-                                   12 + (((M) >> 0) & 0x3), \
-                                   12 + (((M) >> 2) & 0x3), \
-                                   28 + (((M) >> 4) & 0x3), \
-                                   28 + (((M) >> 6) & 0x3)); })
+#define _mm512_shuffle_ps(A, B, M) \
+  (__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \
+                                   (__v16sf)(__m512)(B), (int)(M))
 
-#define _mm512_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
   (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
-                                      (__v16sf)(__m512)(W)); })
+                                      (__v16sf)(__m512)(W))
 
-#define _mm512_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+#define _mm512_maskz_shuffle_ps(U, A, B, M) \
   (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
-                                      (__v16sf)_mm512_setzero_ps()); })
+                                      (__v16sf)_mm512_setzero_ps())
 
-#define _mm_sqrt_round_sd(A, B, R) __extension__ ({ \
+#define _mm_sqrt_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
@@ -7198,13 +6719,13 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_sqrt_round_sd(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)(__m128d)(W), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
@@ -7214,19 +6735,19 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_sqrt_round_sd(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-#define _mm_sqrt_round_ss(A, B, R) __extension__ ({ \
+#define _mm_sqrt_round_ss(A, B, R) \
   (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
@@ -7236,13 +6757,13 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_sqrt_round_ss(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                           (int)(R)); })
+                                           (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
@@ -7252,117 +6773,107 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_sqrt_round_ss(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
   (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_broadcast_f32x4 (__m128 __A)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_broadcast_f32x4(__m128 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
-                 (__v16sf)
-                 _mm512_undefined_ps (),
-                 (__mmask16) -1);
+  return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+                                         0, 1, 2, 3, 0, 1, 2, 3,
+                                         0, 1, 2, 3, 0, 1, 2, 3);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
-                 (__v16sf) __O,
-                 __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+                                           (__v16sf)_mm512_broadcast_f32x4(__A),
+                                           (__v16sf)__O);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
-                 (__v16sf)
-                 _mm512_setzero_ps (),
-                 __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+                                           (__v16sf)_mm512_broadcast_f32x4(__A),
+                                           (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_broadcast_f64x4 (__m256d __A)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_broadcast_f64x4(__m256d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
-                  (__v8df)
-                  _mm512_undefined_pd (),
-                  (__mmask8) -1);
+  return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
+                                          0, 1, 2, 3, 0, 1, 2, 3);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
-                  (__v8df) __O,
-                  __M);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+                                            (__v8df)_mm512_broadcast_f64x4(__A),
+                                            (__v8df)__O);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
-                  (__v8df)
-                  _mm512_setzero_pd (),
-                  __M);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+                                            (__v8df)_mm512_broadcast_f64x4(__A),
+                                            (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i32x4 (__m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcast_i32x4(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
-                  (__v16si)
-                  _mm512_undefined_epi32 (),
-                  (__mmask16) -1);
+  return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+                                          0, 1, 2, 3, 0, 1, 2, 3,
+                                          0, 1, 2, 3, 0, 1, 2, 3);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
-                  (__v16si) __O,
-                  __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                           (__v16si)_mm512_broadcast_i32x4(__A),
+                                           (__v16si)__O);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
-                  (__v16si)
-                  _mm512_setzero_si512 (),
-                  __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                           (__v16si)_mm512_broadcast_i32x4(__A),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i64x4 (__m256i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcast_i64x4(__m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
-                  (__v8di)
-                  _mm512_undefined_epi32 (),
-                  (__mmask8) -1);
+  return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
+                                          0, 1, 2, 3, 0, 1, 2, 3);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
-                  (__v8di) __O,
-                  __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                            (__v8di)_mm512_broadcast_i64x4(__A),
+                                            (__v8di)__O);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
-                  (__v8di)
-                  _mm512_setzero_si512 (),
-                  __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                            (__v8di)_mm512_broadcast_i64x4(__A),
+                                            (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
 {
   return (__m512d)__builtin_ia32_selectpd_512(__M,
@@ -7370,7 +6881,7 @@
                                               (__v8df) __O);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
 {
   return (__m512d)__builtin_ia32_selectpd_512(__M,
@@ -7378,7 +6889,7 @@
                                               (__v8df) _mm512_setzero_pd());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
 {
   return (__m512)__builtin_ia32_selectps_512(__M,
@@ -7386,7 +6897,7 @@
                                              (__v16sf) __O);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
 {
   return (__m512)__builtin_ia32_selectps_512(__M,
@@ -7394,7 +6905,7 @@
                                              (__v16sf) _mm512_setzero_ps());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_cvtsepi32_epi8 (__m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
@@ -7402,14 +6913,14 @@
                (__mmask16) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
                (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
@@ -7417,13 +6928,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
 {
   __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvtsepi32_epi16 (__m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
@@ -7431,14 +6942,14 @@
                (__mmask16) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
                (__v16hi) __O, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
@@ -7446,13 +6957,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
 {
   __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_cvtsepi64_epi8 (__m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
@@ -7460,14 +6971,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
                (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
@@ -7475,29 +6986,28 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
 {
   __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvtsepi64_epi32 (__m512i __A)
 {
-  __v8si __O;
   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
                (__v8si) _mm256_undefined_si256 (),
                (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
                (__v8si) __O, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
@@ -7505,13 +7015,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
 {
   __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_cvtsepi64_epi16 (__m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
@@ -7519,14 +7029,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
                (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
@@ -7534,13 +7044,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
 {
   __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_cvtusepi32_epi8 (__m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
@@ -7548,7 +7058,7 @@
                 (__mmask16) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
@@ -7556,7 +7066,7 @@
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
@@ -7564,13 +7074,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
 {
   __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvtusepi32_epi16 (__m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
@@ -7578,7 +7088,7 @@
                 (__mmask16) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
@@ -7586,7 +7096,7 @@
                 __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
@@ -7594,13 +7104,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
 {
   __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_cvtusepi64_epi8 (__m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
@@ -7608,7 +7118,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
@@ -7616,7 +7126,7 @@
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
@@ -7624,13 +7134,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
 {
   __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvtusepi64_epi32 (__m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
@@ -7638,14 +7148,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
                 (__v8si) __O, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
@@ -7653,13 +7163,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
 {
   __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_cvtusepi64_epi16 (__m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
@@ -7667,14 +7177,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
                 (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
@@ -7682,13 +7192,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
 {
   __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_cvtepi32_epi8 (__m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
@@ -7696,14 +7206,14 @@
               (__mmask16) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
               (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
@@ -7711,13 +7221,13 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
 {
   __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvtepi32_epi16 (__m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
@@ -7725,14 +7235,14 @@
               (__mmask16) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
               (__v16hi) __O, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
@@ -7740,13 +7250,13 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
 {
   __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_cvtepi64_epi8 (__m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
@@ -7754,14 +7264,14 @@
               (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
               (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
@@ -7769,13 +7279,13 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
 {
   __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvtepi64_epi32 (__m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
@@ -7783,14 +7293,14 @@
               (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
               (__v8si) __O, __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
 {
   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
@@ -7798,13 +7308,13 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
 {
   __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_cvtepi64_epi16 (__m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
@@ -7812,14 +7322,14 @@
               (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
               (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
 {
   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
@@ -7827,208 +7337,192 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
 {
   __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
 }
 
-#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
+#define _mm512_extracti32x4_epi32(A, imm) \
   (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_undefined_si128(), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
   (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)(__m128i)(W), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
   (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_setzero_si128(), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
+#define _mm512_extracti64x4_epi64(A, imm) \
   (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_undefined_si256(), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
   (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)(__m256i)(W), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
   (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_setzero_si256(), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
-                                           (__v4df)(__m256d)(B), (int)(imm), \
-                                           (__v8df)_mm512_undefined_pd(), \
-                                           (__mmask8)-1); })
+#define _mm512_insertf64x4(A, B, imm) \
+  (__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
+                                      (__v4df)(__m256d)(B), (int)(imm))
 
-#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
-                                           (__v4df)(__m256d)(B), (int)(imm), \
-                                           (__v8df)(__m512d)(W), \
-                                           (__mmask8)(U)); })
+#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+                                  (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
-                                           (__v4df)(__m256d)(B), (int)(imm), \
-                                           (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(U)); })
+#define _mm512_maskz_insertf64x4(U, A, B, imm) \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+                                  (__v8df)_mm512_setzero_pd())
 
-#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
-                                           (__v4di)(__m256i)(B), (int)(imm), \
-                                           (__v8di)_mm512_setzero_si512(), \
-                                           (__mmask8)-1); })
+#define _mm512_inserti64x4(A, B, imm) \
+  (__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
+                                      (__v4di)(__m256i)(B), (int)(imm))
 
-#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
-                                           (__v4di)(__m256i)(B), (int)(imm), \
-                                           (__v8di)(__m512i)(W), \
-                                           (__mmask8)(U)); })
+#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+                                  (__v8di)(__m512i)(W))
 
-#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
-                                           (__v4di)(__m256i)(B), (int)(imm), \
-                                           (__v8di)_mm512_setzero_si512(), \
-                                           (__mmask8)(U)); })
+#define _mm512_maskz_inserti64x4(U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+                                  (__v8di)_mm512_setzero_si512())
 
-#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
-                                          (__v4sf)(__m128)(B), (int)(imm), \
-                                          (__v16sf)_mm512_undefined_ps(), \
-                                          (__mmask16)-1); })
+#define _mm512_insertf32x4(A, B, imm) \
+  (__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
+                                     (__v4sf)(__m128)(B), (int)(imm))
 
-#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
-                                          (__v4sf)(__m128)(B), (int)(imm), \
-                                          (__v16sf)(__m512)(W), \
-                                          (__mmask16)(U)); })
+#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+                                 (__v16sf)(__m512)(W))
 
-#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
-                                          (__v4sf)(__m128)(B), (int)(imm), \
-                                          (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(U)); })
+#define _mm512_maskz_insertf32x4(U, A, B, imm) \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+                                 (__v16sf)_mm512_setzero_ps())
 
-#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
-                                           (__v4si)(__m128i)(B), (int)(imm), \
-                                           (__v16si)_mm512_setzero_si512(), \
-                                           (__mmask16)-1); })
+#define _mm512_inserti32x4(A, B, imm) \
+  (__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
+                                      (__v4si)(__m128i)(B), (int)(imm))
 
-#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
-                                           (__v4si)(__m128i)(B), (int)(imm), \
-                                           (__v16si)(__m512i)(W), \
-                                           (__mmask16)(U)); })
+#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+                                 (__v16si)(__m512i)(W))
 
-#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
-                                           (__v4si)(__m128i)(B), (int)(imm), \
-                                           (__v16si)_mm512_setzero_si512(), \
-                                           (__mmask16)(U)); })
+#define _mm512_maskz_inserti32x4(U, A, B, imm) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+                                 (__v16si)_mm512_setzero_si512())
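
The insertf/inserti rewrites above drop the dedicated *_mask insert
builtins: each masked form now expands to the unmasked insert builtin fed
through a per-element select (__builtin_ia32_select{pd,q,ps,d}_512). A
minimal sketch of the resulting semantics, using the public blend
intrinsic in place of the internal select builtin (the demo function name
is hypothetical):

    #include <immintrin.h>

    /* What _mm512_mask_insertf64x4(W, U, A, B, 1) computes after this
     * change: perform the unmasked insert, then take each double from
     * the inserted value where the mask bit is set, else from W.
     * Requires an AVX-512F target (e.g. -mavx512f). */
    static __m512d mask_insert_upper_demo(__m512d W, __mmask8 U,
                                          __m512d A, __m256d B) {
      __m512d ins = _mm512_insertf64x4(A, B, 1); /* unmasked insert */
      return _mm512_mask_blend_pd(U, W, ins);    /* lane i: U[i] ? ins : W */
    }
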
 
-#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \
+#define _mm512_getmant_round_pd(A, B, C, R) \
   (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_undefined_pd(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) __extension__ ({ \
+#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
   (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
   (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)); })
+                                            (__mmask8)(U), (int)(R))
 
-#define _mm512_getmant_pd(A, B, C) __extension__ ({ \
+#define _mm512_getmant_pd(A, B, C) \
   (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)-1, \
-                                            _MM_FROUND_CUR_DIRECTION); })
+                                            _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
+#define _mm512_mask_getmant_pd(W, U, A, B, C) \
   (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
                                             (__mmask8)(U), \
-                                            _MM_FROUND_CUR_DIRECTION); })
+                                            _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
+#define _mm512_maskz_getmant_pd(U, A, B, C) \
   (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)(U), \
-                                            _MM_FROUND_CUR_DIRECTION); })
+                                            _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_getmant_round_ps(A, B, C, R) __extension__ ({ \
+#define _mm512_getmant_round_ps(A, B, C, R) \
   (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
-                                           (__mmask16)-1, (int)(R)); })
+                                           (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) __extension__ ({ \
+#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
   (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)(__m512)(W), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
   (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)(U), (int)(R)); })
+                                           (__mmask16)(U), (int)(R))
 
-#define _mm512_getmant_ps(A, B, C) __extension__ ({ \
+#define _mm512_getmant_ps(A, B, C) \
   (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, \
-                                           _MM_FROUND_CUR_DIRECTION); })
+                                           _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+#define _mm512_mask_getmant_ps(W, U, A, B, C) \
   (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), \
-                                           _MM_FROUND_CUR_DIRECTION); })
+                                           _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+#define _mm512_maskz_getmant_ps(U, A, B, C) \
   (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), \
-                                           _MM_FROUND_CUR_DIRECTION); })
+                                           _MM_FROUND_CUR_DIRECTION)
 
-#define _mm512_getexp_round_pd(A, R) __extension__ ({ \
+#define _mm512_getexp_round_pd(A, R) \
   (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_undefined_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_getexp_round_pd(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_getexp_round_pd(W, U, A, R) \
   (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_getexp_round_pd(U, A, R) __extension__ ({ \
+#define _mm512_maskz_getexp_round_pd(U, A, R) \
   (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_getexp_pd (__m512d __A)
 {
   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
@@ -8037,7 +7531,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
@@ -8046,7 +7540,7 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
@@ -8055,22 +7549,22 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_getexp_round_ps(A, R) __extension__ ({ \
+#define _mm512_getexp_round_ps(A, R) \
   (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_undefined_ps(), \
-                                          (__mmask16)-1, (int)(R)); })
+                                          (__mmask16)-1, (int)(R))
 
-#define _mm512_mask_getexp_round_ps(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_getexp_round_ps(W, U, A, R) \
   (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(W), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
-#define _mm512_maskz_getexp_round_ps(U, A, R) __extension__ ({ \
+#define _mm512_maskz_getexp_round_ps(U, A, R) \
   (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(U), (int)(R)); })
+                                          (__mmask16)(U), (int)(R))
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_getexp_ps (__m512 __A)
 {
   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
@@ -8079,7 +7573,7 @@
                _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
@@ -8088,7 +7582,7 @@
                _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
@@ -8097,794 +7591,812 @@
                _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm512_i64gather_ps(index, addr, scale) __extension__ ({ \
+#define _mm512_i64gather_ps(index, addr, scale) \
   (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
                                        (float const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
-                                       (int)(scale)); })
+                                       (int)(scale))
 
-#define _mm512_mask_i64gather_ps( __v1_old, __mask, __index,\
-                                  __addr, __scale) __extension__({\
-__builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
-                              __addr,(__v8di) __index, __mask, __scale);\
-})
+#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
+  (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
+                                       (float const *)(addr), \
+                                       (__v8di)(__m512i)(index), \
+                                       (__mmask8)(mask), (int)(scale))
 
-#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
-  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \
+#define _mm512_i64gather_epi32(index, addr, scale) \
+  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
                                         (int const *)(addr), \
                                         (__v8di)(__m512i)(index), \
-                                        (__mmask8)-1, (int)(scale)); })
+                                        (__mmask8)-1, (int)(scale))
 
-#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
   (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
                                         (int const *)(addr), \
                                         (__v8di)(__m512i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm512_i64gather_pd(index, addr, scale) __extension__ ({\
+#define _mm512_i64gather_pd(index, addr, scale) \
   (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
                                        (double const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
-                                       (int)(scale)); })
+                                       (int)(scale))
 
-#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
   (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
                                        (double const *)(addr), \
                                        (__v8di)(__m512i)(index), \
-                                       (__mmask8)(mask), (int)(scale)); })
+                                       (__mmask8)(mask), (int)(scale))
 
-#define _mm512_i64gather_epi64(index, addr, scale) __extension__ ({\
-  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_pd(), \
+#define _mm512_i64gather_epi64(index, addr, scale) \
+  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (long long const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
-                                       (int)(scale)); })
+                                       (int)(scale))
 
-#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
   (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
                                        (long long const *)(addr), \
                                        (__v8di)(__m512i)(index), \
-                                       (__mmask8)(mask), (int)(scale)); })
+                                       (__mmask8)(mask), (int)(scale))
 
-#define _mm512_i32gather_ps(index, addr, scale) __extension__ ({\
+#define _mm512_i32gather_ps(index, addr, scale) \
   (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
                                        (float const *)(addr), \
                                        (__v16sf)(__m512)(index), \
-                                       (__mmask16)-1, (int)(scale)); })
+                                       (__mmask16)-1, (int)(scale))
 
-#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
   (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
                                        (float const *)(addr), \
                                        (__v16sf)(__m512)(index), \
-                                       (__mmask16)(mask), (int)(scale)); })
+                                       (__mmask16)(mask), (int)(scale))
 
-#define _mm512_i32gather_epi32(index, addr, scale) __extension__ ({\
+#define _mm512_i32gather_epi32(index, addr, scale) \
   (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
                                         (int const *)(addr), \
                                         (__v16si)(__m512i)(index), \
-                                        (__mmask16)-1, (int)(scale)); })
+                                        (__mmask16)-1, (int)(scale))
 
-#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
   (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
                                         (int const *)(addr), \
                                         (__v16si)(__m512i)(index), \
-                                        (__mmask16)(mask), (int)(scale)); })
+                                        (__mmask16)(mask), (int)(scale))
 
-#define _mm512_i32gather_pd(index, addr, scale) __extension__ ({\
+#define _mm512_i32gather_pd(index, addr, scale) \
   (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
                                        (double const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
-                                       (int)(scale)); })
+                                       (int)(scale))
 
-#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
   (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
                                        (double const *)(addr), \
                                        (__v8si)(__m256i)(index), \
-                                       (__mmask8)(mask), (int)(scale)); })
+                                       (__mmask8)(mask), (int)(scale))
 
-#define _mm512_i32gather_epi64(index, addr, scale) __extension__ ({\
+#define _mm512_i32gather_epi64(index, addr, scale) \
   (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (long long const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
-                                       (int)(scale)); })
+                                       (int)(scale))
 
-#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
   (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
                                        (long long const *)(addr), \
                                        (__v8si)(__m256i)(index), \
-                                       (__mmask8)(mask), (int)(scale)); })
+                                       (__mmask8)(mask), (int)(scale))
 
-#define _mm512_i64scatter_ps(addr, index, v1, scale) __extension__ ({\
+#define _mm512_i64scatter_ps(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)-1, \
                                 (__v8di)(__m512i)(index), \
-                                (__v8sf)(__m256)(v1), (int)(scale)); })
+                                (__v8sf)(__m256)(v1), (int)(scale))
 
-#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
+#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)(mask), \
                                 (__v8di)(__m512i)(index), \
-                                (__v8sf)(__m256)(v1), (int)(scale)); })
+                                (__v8sf)(__m256)(v1), (int)(scale))
 
-#define _mm512_i64scatter_epi32(addr, index, v1, scale) __extension__ ({\
+#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)-1, \
                                 (__v8di)(__m512i)(index), \
-                                (__v8si)(__m256i)(v1), (int)(scale)); })
+                                (__v8si)(__m256i)(v1), (int)(scale))
 
-#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
+#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)(mask), \
                                 (__v8di)(__m512i)(index), \
-                                (__v8si)(__m256i)(v1), (int)(scale)); })
+                                (__v8si)(__m256i)(v1), (int)(scale))
 
-#define _mm512_i64scatter_pd(addr, index, v1, scale) __extension__ ({\
+#define _mm512_i64scatter_pd(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
-                               (__v8df)(__m512d)(v1), (int)(scale)); })
+                               (__v8df)(__m512d)(v1), (int)(scale))
 
-#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
+#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
-                               (__v8df)(__m512d)(v1), (int)(scale)); })
+                               (__v8df)(__m512d)(v1), (int)(scale))
 
-#define _mm512_i64scatter_epi64(addr, index, v1, scale) __extension__ ({\
+#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
-                               (__v8di)(__m512i)(v1), (int)(scale)); })
+                               (__v8di)(__m512i)(v1), (int)(scale))
 
-#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
+#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
-                               (__v8di)(__m512i)(v1), (int)(scale)); })
+                               (__v8di)(__m512i)(v1), (int)(scale))
 
-#define _mm512_i32scatter_ps(addr, index, v1, scale) __extension__ ({\
+#define _mm512_i32scatter_ps(addr, index, v1, scale) \
   __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)-1, \
                                 (__v16si)(__m512i)(index), \
-                                (__v16sf)(__m512)(v1), (int)(scale)); })
+                                (__v16sf)(__m512)(v1), (int)(scale))
 
-#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
+#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
   __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)(mask), \
                                 (__v16si)(__m512i)(index), \
-                                (__v16sf)(__m512)(v1), (int)(scale)); })
+                                (__v16sf)(__m512)(v1), (int)(scale))
 
-#define _mm512_i32scatter_epi32(addr, index, v1, scale) __extension__ ({\
+#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
   __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)-1, \
                                 (__v16si)(__m512i)(index), \
-                                (__v16si)(__m512i)(v1), (int)(scale)); })
+                                (__v16si)(__m512i)(v1), (int)(scale))
 
-#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
+#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
   __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)(mask), \
                                 (__v16si)(__m512i)(index), \
-                                (__v16si)(__m512i)(v1), (int)(scale)); })
+                                (__v16si)(__m512i)(v1), (int)(scale))
 
-#define _mm512_i32scatter_pd(addr, index, v1, scale) __extension__ ({\
+#define _mm512_i32scatter_pd(addr, index, v1, scale) \
   __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)-1, \
                                (__v8si)(__m256i)(index), \
-                               (__v8df)(__m512d)(v1), (int)(scale)); })
+                               (__v8df)(__m512d)(v1), (int)(scale))
 
-#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
+#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
   __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)(mask), \
                                (__v8si)(__m256i)(index), \
-                               (__v8df)(__m512d)(v1), (int)(scale)); })
+                               (__v8df)(__m512d)(v1), (int)(scale))
 
-#define _mm512_i32scatter_epi64(addr, index, v1, scale) __extension__ ({\
+#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
   __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)-1, \
                                (__v8si)(__m256i)(index), \
-                               (__v8di)(__m512i)(v1), (int)(scale)); })
+                               (__v8di)(__m512i)(v1), (int)(scale))
 
-#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
+#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
   __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)(mask), \
                                (__v8si)(__m256i)(index), \
-                               (__v8di)(__m512i)(v1), (int)(scale)); })
+                               (__v8di)(__m512i)(v1), (int)(scale))
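
The gather/scatter macros above keep their public shapes -- (v1_old, mask,
index, addr, scale) for masked gathers and (addr, mask, index, v1, scale)
for masked scatters -- while shedding the statement-expression wrappers
and adding explicit casts on every operand. A short usage sketch, assuming
an AVX-512F target and valid float storage behind both pointers
(demo-only names):

    #include <immintrin.h>

    static void gather_scatter_demo(const float *src, float *dst) {
      __m512i idx = _mm512_set1_epi32(0); /* every lane addresses src[0] */
      __mmask16 m = 0x00FF;               /* only the low 8 lanes active */
      __m512 fill = _mm512_setzero_ps();  /* pass-through for masked-off lanes */
      /* scale is a compile-time constant: 4 bytes per float element. */
      __m512 v = _mm512_mask_i32gather_ps(fill, m, idx, src, 4);
      _mm512_mask_i32scatter_ps(dst, m, idx, v, 4);
    }
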
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
+                                       (__v4sf)__A,
+                                       (__v4sf)__B,
+                                       (__mmask8)__U,
+                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\
+#define _mm_fmadd_round_ss(A, B, C, R) \
   (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
-                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                        (int)(R)); })
+                                        (__v4sf)(__m128)(C), (__mmask8)-1, \
+                                        (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        (__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
- return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
+                                        (__v4sf)__B,
+                                        (__v4sf)__C,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) __extension__ ({\
+#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
   (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
-          (__v4sf) __X,
-          (__v4sf) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
+                                        (__v4sf)__X,
+                                        (__v4sf)__Y,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) __extension__ ({\
+#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
   (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                          (__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
-                                         (int)(R)); })
+                                         (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
-          -(__v4sf) __B,
-          (__v4sf) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
+                                       (__v4sf)__A,
+                                       -(__v4sf)__B,
+                                       (__mmask8)__U,
+                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\
+#define _mm_fmsub_round_ss(A, B, C, R) \
   (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
-                                        -(__v4sf)(__m128)(B), \
-                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                        (int)(R)); })
+                                        (__v4sf)(__m128)(B), \
+                                        -(__v4sf)(__m128)(C), (__mmask8)-1, \
+                                        (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        (__v4sf)(__m128)(A), \
+                                        -(__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
- return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
-          (__v4sf) __B,
-          -(__v4sf) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
+                                        (__v4sf)__B,
+                                        -(__v4sf)__C,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) __extension__ ({\
+#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
   (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
-                                         (int)(R)); })
+                                         (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
-          (__v4sf) __X,
-          -(__v4sf) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
+                                        (__v4sf)__X,
+                                        (__v4sf)__Y,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\
-  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
-                                         (__v4sf)(__m128)(X), \
-                                         -(__v4sf)(__m128)(Y), (__mmask8)(U), \
-                                         (int)(R)); })
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\
-  (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                        (int)(R)); })
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) __extension__ ({\
-  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
-                                         (__v4sf)(__m128)(B), \
-                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
-                                         (int)(R)); })
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
-{
- return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
-          (__v4sf) __X,
-          (__v4sf) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) __extension__({\
-  (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
+#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
+  (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
                                          (__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
-                                         (int)(R)); })
+                                         (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
+                                       -(__v4sf)__A,
+                                       (__v4sf)__B,
+                                       (__mmask8)__U,
+                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fnmadd_round_ss(A, B, C, R) \
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
+                                        -(__v4sf)(__m128)(B), \
+                                        (__v4sf)(__m128)(C), (__mmask8)-1, \
+                                        (int)(R))
+
+#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        -(__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
+                                        -(__v4sf)__B,
+                                        (__v4sf)__C,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         -(__v4sf)(__m128)(B), \
+                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         (int)(R))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
+                                        -(__v4sf)__X,
+                                        (__v4sf)__Y,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
+  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+                                         -(__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
-          -(__v4sf) __B,
-          (__v4sf) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
+                                       -(__v4sf)__A,
+                                       -(__v4sf)__B,
+                                       (__mmask8)__U,
+                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\
-  (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
+#define _mm_fnmsub_round_ss(A, B, C, R) \
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), \
-                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                        (int)(R)); })
+                                        -(__v4sf)(__m128)(C), (__mmask8)-1, \
+                                        (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        -(__v4sf)(__m128)(A), \
+                                        -(__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
- return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
-          (__v4sf) __B,
-          -(__v4sf) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
+                                        -(__v4sf)__B,
+                                        -(__v4sf)__C,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) __extension__ ({\
-  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
-                                         (__v4sf)(__m128)(B), \
+#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         -(__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
-                                         _MM_FROUND_CUR_DIRECTION); })
+                                         (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
-          (__v4sf) __X,
-          -(__v4sf) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
+                                        -(__v4sf)__X,
+                                        (__v4sf)__Y,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\
-  (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
-                                         (__v4sf)(__m128)(X), \
-                                         -(__v4sf)(__m128)(Y), (__mmask8)(U), \
-                                         (int)(R)); })
+#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
+  (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
+                                         -(__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R))
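
Besides losing the statement expressions, the scalar FMA block above
reorders the builtin operands for the _mm_mask_* forms: W now comes
first, so the blended element is W[0]*A[0] + B[0] when the mask bit is
set and W[0] otherwise, with the upper lanes always copied from W. A
hypothetical scalar reference for element 0 (the rounding-mode argument
is ignored for brevity):

    /* Reference semantics for _mm_mask_fmadd_ss(W, U, A, B), element 0
     * only; not part of the header, just a model of the blend. */
    static float mask_fmadd_ss_ref(float w, unsigned char u,
                                   float a, float b) {
      return (u & 1) ? (w * a + b) : w;
    }
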
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
-          (__v2df) __B,
-          (__v2df) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
+                                       (__v2df)__A,
+                                       (__v2df)__B,
+                                       (__mmask8)__U,
+                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\
+#define _mm_fmadd_round_sd(A, B, C, R) \
   (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
-                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
-                                         (int)(R)); })
+                                         (__v2df)(__m128d)(C), (__mmask8)-1, \
+                                         (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         (__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
-          (__v2df) __B,
-          (__v2df) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
+                                        (__v2df)__B,
+                                        (__v2df)__C,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) __extension__ ({\
+#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
   (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
-          (__v2df) __X,
-          (__v2df) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
+                                        (__v2df)__X,
+                                        (__v2df)__Y,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) __extension__ ({\
+#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
   (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
                                           (__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
-          -(__v2df) __B,
-          (__v2df) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
+                                       (__v2df)__A,
+                                       -(__v2df)__B,
+                                       (__mmask8)__U,
+                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\
+#define _mm_fmsub_round_sd(A, B, C, R) \
   (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
-                                         -(__v2df)(__m128d)(B), \
-                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
-                                         (int)(R)); })
+                                         (__v2df)(__m128d)(B), \
+                                         -(__v2df)(__m128d)(C), (__mmask8)-1, \
+                                         (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         (__v2df)(__m128d)(A), \
+                                         -(__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
-          (__v2df) __B,
-          -(__v2df) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
+                                        (__v2df)__B,
+                                        -(__v2df)__C,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) __extension__ ({\
+#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
   (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           -(__v2df)(__m128d)(C), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
-          (__v2df) __X,
-          -(__v2df) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
+                                        (__v2df)__X,
+                                        (__v2df)__Y,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\
-  (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
+  (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
                                           (__v2df)(__m128d)(X), \
-                                          -(__v2df)(__m128d)(Y), \
-                                          (__mmask8)(U), (int)(R)); })
+                                          (__v2df)(__m128d)(Y), \
+                                          (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
-          (__v2df) __B,
-          (__v2df) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
+                                       -(__v2df)__A,
+                                       (__v2df)__B,
+                                       (__mmask8)__U,
+                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\
-  (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
-                                         (int)(R)); })
+#define _mm_fnmadd_round_sd(A, B, C, R) \
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
+                                         -(__v2df)(__m128d)(B), \
+                                         (__v2df)(__m128d)(C), (__mmask8)-1, \
+                                         (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         -(__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
-          (__v2df) __B,
-          (__v2df) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
+                                        -(__v2df)__B,
+                                        (__v2df)__C,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) __extension__ ({\
-  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
-                                          (__v2df)(__m128d)(B), \
+#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          -(__v2df)(__m128d)(B), \
                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) __W,
-          (__v2df) __X,
-          (__v2df) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
+                                        -(__v2df)__X,
+                                        (__v2df)__Y,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) __extension__({\
-  (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
-                                          (__v2df)(__m128d)(X), \
+#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+                                          -(__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
-                                          (int)(R)); })
+                                          (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
-          -(__v2df) __B,
-          (__v2df) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
+                                       -(__v2df)__A,
+                                       -(__v2df)__B,
+                                       (__mmask8)__U,
+                                       _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\
-  (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
+#define _mm_fnmsub_round_sd(A, B, C, R) \
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), \
-                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
-                                         (int)(R)); })
+                                         -(__v2df)(__m128d)(C), (__mmask8)-1, \
+                                         (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         -(__v2df)(__m128d)(A), \
+                                         -(__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
-          (__v2df) __B,
-          -(__v2df) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
+                                        -(__v2df)__B,
+                                        -(__v2df)__C,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) __extension__ ({\
-  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
-                                          (__v2df)(__m128d)(B), \
+#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          -(__v2df)(__m128d)(B), \
                                           -(__v2df)(__m128d)(C), \
                                           (__mmask8)(U), \
-                                          _MM_FROUND_CUR_DIRECTION); })
+                                          (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) (__W),
-          (__v2df) __X,
-          -(__v2df) (__Y),
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
+                                        -(__v2df)__X,
+                                        (__v2df)__Y,
+                                        (__mmask8)__U,
+                                        _MM_FROUND_CUR_DIRECTION);
 }
 
-#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\
-  (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
-                                          (__v2df)(__m128d)(X), \
-                                          -(__v2df)(__m128d)(Y), \
-                                          (__mmask8)(U), (int)(R)); })
+#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
+  (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
+                                          -(__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), \
+                                          (__mmask8)(U), (int)(R))
 
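+// Editorial note (not in the upstream header): the fmsub/fnmadd/fnmsub
+// scalar forms above all funnel into the single vfmadd builtin by negating
+// operands, using fma(a, b, c) = a*b + c:
+//   fmsub(a, b, c)  =  a*b - c  ->  fma(a,  b, -c)
+//   fnmadd(a, b, c) = -a*b + c  ->  fma(a, -b,  c)
+//   fnmsub(a, b, c) = -a*b - c  ->  fma(a, -b, -c)
+// The mask3 variants are the exception: a clear mask bit must pass the
+// addend operand through unchanged, so the addend cannot be negated at the
+// call site; that is why the fmsub/fnmsub mask3 forms now use the dedicated
+// vfmsubsd3_mask3 builtin instead.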
-#define _mm512_permutex_pd(X, C) __extension__ ({ \
-  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
-                                   (__v8df)_mm512_undefined_pd(), \
-                                   0 + (((C) >> 0) & 0x3), \
-                                   0 + (((C) >> 2) & 0x3), \
-                                   0 + (((C) >> 4) & 0x3), \
-                                   0 + (((C) >> 6) & 0x3), \
-                                   4 + (((C) >> 0) & 0x3), \
-                                   4 + (((C) >> 2) & 0x3), \
-                                   4 + (((C) >> 4) & 0x3), \
-                                   4 + (((C) >> 6) & 0x3)); })
+#define _mm512_permutex_pd(X, C) \
+  (__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C))
 
-#define _mm512_mask_permutex_pd(W, U, X, C) __extension__ ({ \
+#define _mm512_mask_permutex_pd(W, U, X, C) \
   (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permutex_pd((X), (C)), \
-                                       (__v8df)(__m512d)(W)); })
+                                       (__v8df)(__m512d)(W))
 
-#define _mm512_maskz_permutex_pd(U, X, C) __extension__ ({ \
+#define _mm512_maskz_permutex_pd(U, X, C) \
   (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permutex_pd((X), (C)), \
-                                       (__v8df)_mm512_setzero_pd()); })
+                                       (__v8df)_mm512_setzero_pd())
 
-#define _mm512_permutex_epi64(X, C) __extension__ ({ \
-  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(X), \
-                                   (__v8di)_mm512_undefined_epi32(), \
-                                   0 + (((C) >> 0) & 0x3), \
-                                   0 + (((C) >> 2) & 0x3), \
-                                   0 + (((C) >> 4) & 0x3), \
-                                   0 + (((C) >> 6) & 0x3), \
-                                   4 + (((C) >> 0) & 0x3), \
-                                   4 + (((C) >> 2) & 0x3), \
-                                   4 + (((C) >> 4) & 0x3), \
-                                   4 + (((C) >> 6) & 0x3)); })
+#define _mm512_permutex_epi64(X, C) \
+  (__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C))
 
-#define _mm512_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
+#define _mm512_mask_permutex_epi64(W, U, X, C) \
   (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
-                                      (__v8di)(__m512i)(W)); })
+                                      (__v8di)(__m512i)(W))
 
-#define _mm512_maskz_permutex_epi64(U, X, C) __extension__ ({ \
+#define _mm512_maskz_permutex_epi64(U, X, C) \
   (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
-                                      (__v8di)_mm512_setzero_si512()); })
+                                      (__v8di)_mm512_setzero_si512())
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_permutexvar_pd (__m512i __X, __m512d __Y)
 {
-  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
-                 (__v8di) __X,
-                 (__v8df) _mm512_undefined_pd (),
-                 (__mmask8) -1);
+  return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
 {
-  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
-                 (__v8di) __X,
-                 (__v8df) __W,
-                 (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                        (__v8df)_mm512_permutexvar_pd(__X, __Y),
+                                        (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
 {
-  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
-                 (__v8di) __X,
-                 (__v8df) _mm512_setzero_pd (),
-                 (__mmask8) __U);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                        (__v8df)_mm512_permutexvar_pd(__X, __Y),
+                                        (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
-                 (__v8di) __X,
-                 (__v8di) _mm512_setzero_si512 (),
-                 __M);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
-                 (__v8di) __X,
-                 (__v8di) _mm512_undefined_epi32 (),
-                 (__mmask8) -1);
+  return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                     (__v8di)_mm512_permutexvar_epi64(__X, __Y),
+                                     (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
              __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
-                 (__v8di) __X,
-                 (__v8di) __W,
-                 __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                     (__v8di)_mm512_permutexvar_epi64(__X, __Y),
+                                     (__v8di)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_permutexvar_ps (__m512i __X, __m512 __Y)
 {
-  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
-                (__v16si) __X,
-                (__v16sf) _mm512_undefined_ps (),
-                (__mmask16) -1);
+  return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
 {
-  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
-                (__v16si) __X,
-                (__v16sf) __W,
-                (__mmask16) __U);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                       (__v16sf)_mm512_permutexvar_ps(__X, __Y),
+                                       (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
 {
-  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
-                (__v16si) __X,
-                (__v16sf) _mm512_setzero_ps (),
-                (__mmask16) __U);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                       (__v16sf)_mm512_permutexvar_ps(__X, __Y),
+                                       (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
-                 (__v16si) __X,
-                 (__v16si) _mm512_setzero_si512 (),
-                 __M);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
-                 (__v16si) __X,
-                 (__v16si) _mm512_undefined_epi32 (),
-                 (__mmask16) -1);
+  return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                    (__v16si)_mm512_permutexvar_epi32(__X, __Y),
+                                    (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
              __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
-                 (__v16si) __X,
-                 (__v16si) __W,
-                 __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                    (__v16si)_mm512_permutexvar_epi32(__X, __Y),
+                                    (__v16si)__W);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
+
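+// Editorial note (not in the upstream header): the masked permute forms are
+// now composed from the unmasked intrinsic plus a generic select builtin
+// rather than a dedicated *_mask builtin:
+//   mask  form: select(U, op(args), W)     // keep W where a U bit is clear
+//   maskz form: select(U, op(args), zero)  // zero where a U bit is clear
+// Expressing the blend generically lets the optimizer fold the select away
+// whenever the mask is known to be all-ones.
+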
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
 _mm512_kand (__mmask16 __A, __mmask16 __B)
 {
   return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
 _mm512_kandn (__mmask16 __A, __mmask16 __B)
 {
   return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
 _mm512_kor (__mmask16 __A, __mmask16 __B)
 {
   return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_kortestc (__mmask16 __A, __mmask16 __B)
 {
   return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_kortestz (__mmask16 __A, __mmask16 __B)
 {
   return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
 {
   return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
 _mm512_kxnor (__mmask16 __A, __mmask16 __B)
 {
   return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
 _mm512_kxor (__mmask16 __A, __mmask16 __B)
 {
   return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_stream_si512 (__m512i * __P, __m512i __A)
 {
-  __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
+  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
+  __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_stream_load_si512 (void *__P)
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_stream_load_si512 (void const *__P)
 {
-  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
+  return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_stream_pd (double *__P, __m512d __A)
 {
-  __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
+  typedef __v8df __v8df_aligned __attribute__((aligned(64)));
+  __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_stream_ps (float *__P, __m512 __A)
 {
-  __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
+  typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
+  __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
 }
 
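+// Editorial note (not in the upstream header): __builtin_nontemporal_store
+// and __builtin_nontemporal_load take their alignment from the pointee type,
+// and the alignment of the plain vector typedefs can be capped below 64 on
+// some targets (e.g. via -fmax-type-align).  The locally scoped aligned(64)
+// typedefs pin the 64-byte alignment the stream intrinsics require.  A
+// minimal caller sketch, assuming a suitably aligned buffer:
+//
+//   _Alignas(64) double buf[8];
+//   _mm512_stream_pd(buf, _mm512_set1_pd(1.0));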
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
@@ -8892,7 +8404,7 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
@@ -8901,7 +8413,7 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
@@ -8909,7 +8421,7 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
@@ -8918,7 +8430,7 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
@@ -8926,7 +8438,7 @@
                  (__mmask16) __U);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
@@ -8935,7 +8447,7 @@
                  (__mmask16) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
@@ -8943,7 +8455,7 @@
                   (__mmask16) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
@@ -8952,58 +8464,116 @@
                   (__mmask16) __U);
 }
 
-#define _mm_cmp_round_ss_mask(X, Y, P, R) __extension__ ({ \
+#define _mm_cmp_round_ss_mask(X, Y, P, R) \
   (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
-                                      (__mmask8)-1, (int)(R)); })
+                                      (__mmask8)-1, (int)(R))
 
-#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) __extension__ ({ \
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
   (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
-                                      (__mmask8)(M), (int)(R)); })
+                                      (__mmask8)(M), (int)(R))
 
-#define _mm_cmp_ss_mask(X, Y, P) __extension__ ({ \
+#define _mm_cmp_ss_mask(X, Y, P) \
   (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)-1, \
-                                      _MM_FROUND_CUR_DIRECTION); })
+                                      _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_cmp_ss_mask(M, X, Y, P) __extension__ ({ \
+#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
   (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)(M), \
-                                      _MM_FROUND_CUR_DIRECTION); })
+                                      _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_cmp_round_sd_mask(X, Y, P, R) __extension__ ({ \
+#define _mm_cmp_round_sd_mask(X, Y, P, R) \
   (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
-                                      (__mmask8)-1, (int)(R)); })
+                                      (__mmask8)-1, (int)(R))
 
-#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) __extension__ ({ \
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
   (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
-                                      (__mmask8)(M), (int)(R)); })
+                                      (__mmask8)(M), (int)(R))
 
-#define _mm_cmp_sd_mask(X, Y, P) __extension__ ({ \
+#define _mm_cmp_sd_mask(X, Y, P) \
   (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)-1, \
-                                      _MM_FROUND_CUR_DIRECTION); })
+                                      _MM_FROUND_CUR_DIRECTION)
 
-#define _mm_mask_cmp_sd_mask(M, X, Y, P) __extension__ ({ \
+#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
   (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)(M), \
-                                      _MM_FROUND_CUR_DIRECTION); })
+                                      _MM_FROUND_CUR_DIRECTION)
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+/* Bit Test */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_test_epi32_mask (__m512i __A, __m512i __B)
+{
+  return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
+                                   _mm512_setzero_si512());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
+                                        _mm512_setzero_si512());
+}
+
+static __inline __mmask8 __DEFAULT_FN_ATTRS512
+_mm512_test_epi64_mask (__m512i __A, __m512i __B)
+{
+  return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
+                                   _mm512_setzero_si512());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
+_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
+                                        _mm512_setzero_si512());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+  return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
+                                  _mm512_setzero_si512());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
+                                       _mm512_setzero_si512());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+  return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
+                                  _mm512_setzero_si512());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
+                                       _mm512_setzero_si512());
+}
+
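+// Editorial note (not in the upstream header): the test/testn families above
+// reduce to a compare of (A & B) against zero, one mask bit per element:
+//   test:  k[i] = ((A[i] & B[i]) != 0)
+//   testn: k[i] = ((A[i] & B[i]) == 0)
+// A minimal sketch, checking a per-lane flag bit:
+//
+//   __m512i v     = _mm512_set1_epi32(0x6);
+//   __m512i flags = _mm512_set1_epi32(0x4);
+//   __mmask16 k   = _mm512_test_epi32_mask(v, flags);  // k == 0xFFFF
+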
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_movehdup_ps (__m512 __A)
 {
   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
                          1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
@@ -9011,7 +8581,7 @@
                                              (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
 {
   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
@@ -9019,14 +8589,14 @@
                                              (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_moveldup_ps (__m512 __A)
 {
   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
                          0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
@@ -9034,7 +8604,7 @@
                                              (__v16sf)__W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
 {
   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
@@ -9042,37 +8612,94 @@
                                              (__v16sf)_mm512_setzero_ps());
 }
 
-#define _mm512_shuffle_epi32(A, I) __extension__ ({ \
-  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
-                                   (__v16si)_mm512_undefined_epi32(), \
-                                   0  + (((I) >> 0) & 0x3), \
-                                   0  + (((I) >> 2) & 0x3), \
-                                   0  + (((I) >> 4) & 0x3), \
-                                   0  + (((I) >> 6) & 0x3), \
-                                   4  + (((I) >> 0) & 0x3), \
-                                   4  + (((I) >> 2) & 0x3), \
-                                   4  + (((I) >> 4) & 0x3), \
-                                   4  + (((I) >> 6) & 0x3), \
-                                   8  + (((I) >> 0) & 0x3), \
-                                   8  + (((I) >> 2) & 0x3), \
-                                   8  + (((I) >> 4) & 0x3), \
-                                   8  + (((I) >> 6) & 0x3), \
-                                   12 + (((I) >> 0) & 0x3), \
-                                   12 + (((I) >> 2) & 0x3), \
-                                   12 + (((I) >> 4) & 0x3), \
-                                   12 + (((I) >> 6) & 0x3)); })
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
+}
 
-#define _mm512_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
+                                     _mm_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
+                                     _mm_setzero_pd());
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
+{
+  __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
+                                                (__v4sf)_mm_setzero_ps(),
+                                                0, 4, 4, 4);
+
+  return (__m128) __builtin_ia32_loadss128_mask ((__v4sf *) __A, src, __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_load_ss (__mmask8 __U, const float* __A)
+{
+  return (__m128)__builtin_ia32_loadss128_mask ((__v4sf *) __A,
+                                                (__v4sf) _mm_setzero_ps(),
+                                                __U & 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
+{
+  __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
+                                                 (__v2df)_mm_setzero_pd(),
+                                                 0, 2);
+
+  return (__m128d) __builtin_ia32_loadsd128_mask ((__v2df *) __A, src, __U & 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_load_sd (__mmask8 __U, const double* __A)
+{
+  return (__m128d) __builtin_ia32_loadsd128_mask ((__v2df *) __A,
+                                                  (__v2df) _mm_setzero_pd(),
+                                                  __U & 1);
+}
+
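+// Editorial note (not in the upstream header): the masked scalar move, load
+// and store forms above consume only bit 0 of the mask (hence the explicit
+// "__U & 1").  Element 0 is conditionally replaced while the remaining
+// elements come from the blend source; for _mm_mask_load_ss the
+// shufflevector first builds a source whose upper three elements are zero,
+// matching the memory-form MOVSS semantics of zeroing bits 127:32.
+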
+#define _mm512_shuffle_epi32(A, I) \
+  (__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I))
+
+#define _mm512_mask_shuffle_epi32(W, U, A, I) \
   (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
-                                      (__v16si)(__m512i)(W)); })
+                                      (__v16si)(__m512i)(W))
 
-#define _mm512_maskz_shuffle_epi32(U, A, I) __extension__ ({ \
+#define _mm512_maskz_shuffle_epi32(U, A, I) \
   (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
-                                      (__v16si)_mm512_setzero_si512()); })
+                                      (__v16si)_mm512_setzero_si512())
 
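+// Editorial note (not in the upstream header): immediate-driven shuffles
+// such as _mm512_shuffle_epi32 and _mm512_permutex_pd/_epi64 switched from
+// an open-coded __builtin_shufflevector with per-lane computed indices to a
+// dedicated builtin taking the raw 8-bit immediate.  The builtin form keeps
+// the macro expansion small and lets the compiler range-check the immediate.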
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
@@ -9080,7 +8707,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
@@ -9088,7 +8715,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
@@ -9096,15 +8723,15 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
-                (__v8di) _mm512_setzero_pd (),
+                (__v8di) _mm512_setzero_si512 (),
                 (__mmask8) __U);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
 {
   return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
@@ -9112,7 +8739,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
 {
   return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
@@ -9120,7 +8747,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
@@ -9128,15 +8755,15 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
-              (__v8di) _mm512_setzero_pd(),
+              (__v8di) _mm512_setzero_si512(),
               (__mmask8) __U);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
 {
   return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
@@ -9144,7 +8771,7 @@
                    (__mmask16) __U);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
 {
   return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
@@ -9152,7 +8779,7 @@
                    (__mmask16) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
@@ -9160,15 +8787,15 @@
               (__mmask16) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
-              (__v16si) _mm512_setzero_ps(),
+              (__v16si) _mm512_setzero_si512(),
               (__mmask16) __U);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
@@ -9176,7 +8803,7 @@
                (__mmask16) __U);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
@@ -9184,7 +8811,7 @@
                (__mmask16) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
@@ -9192,59 +8819,64 @@
                 (__mmask16) __U);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
-                (__v16si) _mm512_setzero_ps(),
+                (__v16si) _mm512_setzero_si512(),
                 (__mmask16) __U);
 }
 
-#define _mm512_cvt_roundps_pd(A, R) __extension__ ({ \
+#define _mm512_cvt_roundps_pd(A, R) \
   (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)_mm512_undefined_pd(), \
-                                           (__mmask8)-1, (int)(R)); })
+                                           (__mmask8)-1, (int)(R))
 
-#define _mm512_mask_cvt_roundps_pd(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
   (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)(__m512d)(W), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-#define _mm512_maskz_cvt_roundps_pd(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundps_pd(U, A, R) \
   (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)); })
+                                           (__mmask8)(U), (int)(R))
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_cvtps_pd (__m256 __A)
 {
-  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
-                (__v8df)
-                _mm512_undefined_pd (),
-                (__mmask8) -1,
-                _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
 {
-  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
-                (__v8df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_cvtps_pd(__A),
+                                              (__v8df)__W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
 {
-  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
-                (__v8df)
-                _mm512_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_cvtps_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtpslo_pd (__m512 __A)
+{
+  return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
+{
+  return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
+}
+
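+// Editorial note (not in the upstream header): _mm512_cvtps_pd now lowers
+// through the generic __builtin_convertvector (an exact float-to-double
+// extension of all eight elements) instead of the masked cvtps2pd builtin,
+// with the masked forms rebuilt from it via select.  Only the
+// _mm512_cvt_roundps_pd macros still need the target builtin, to carry the
+// rounding/SAE immediate.
+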
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
@@ -9252,7 +8884,7 @@
               (__v8df) __W);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
 {
   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
@@ -9260,7 +8892,7 @@
               (__v8df) _mm512_setzero_pd ());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
@@ -9268,7 +8900,7 @@
              (__v16sf) __W);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
 {
   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
@@ -9276,191 +8908,243 @@
              (__v16sf) _mm512_setzero_ps ());
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
 {
   __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
             (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
 {
   __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
             (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
 {
   __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
             (__mmask16) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
 {
   __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
             (__mmask16) __U);
 }
 
-#define _mm_cvt_roundsd_ss(A, B, R) __extension__ ({ \
+#define _mm_cvt_roundsd_ss(A, B, R) \
   (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)_mm_undefined_ps(), \
-                                             (__mmask8)-1, (int)(R)); })
+                                             (__mmask8)-1, (int)(R))
 
-#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
   (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)(__m128)(W), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
   (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
-                                             (__mmask8)(U), (int)(R)); })
+                                             (__mmask8)(U), (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
 {
-  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
-                                             (__v2df)(__B),
-                                             (__v4sf)(__W), 
-                                             (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
+                                             (__v2df)__B,
+                                             (__v4sf)__W,
+                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
 {
-  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
-                                             (__v2df)(__B),
-                                             (__v4sf)_mm_setzero_ps(), 
-                                             (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
+                                             (__v2df)__B,
+                                             (__v4sf)_mm_setzero_ps(),
+                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm_cvtss_i32 _mm_cvtss_si32
-#define _mm_cvtss_i64 _mm_cvtss_si64
 #define _mm_cvtsd_i32 _mm_cvtsd_si32
-#define _mm_cvtsd_i64 _mm_cvtsd_si64
 #define _mm_cvti32_sd _mm_cvtsi32_sd
-#define _mm_cvti64_sd _mm_cvtsi64_sd
 #define _mm_cvti32_ss _mm_cvtsi32_ss
+#ifdef __x86_64__
+#define _mm_cvtss_i64 _mm_cvtss_si64
+#define _mm_cvtsd_i64 _mm_cvtsd_si64
+#define _mm_cvti64_sd _mm_cvtsi64_sd
 #define _mm_cvti64_ss _mm_cvtsi64_ss
+#endif
 
-#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \
+#ifdef __x86_64__
+#define _mm_cvt_roundi64_sd(A, B, R) \
   (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
-                                     (int)(R)); })
+                                     (int)(R))
 
-#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \
+#define _mm_cvt_roundsi64_sd(A, B, R) \
   (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
-                                     (int)(R)); })
+                                     (int)(R))
+#endif
 
-#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+#define _mm_cvt_roundsi32_ss(A, B, R) \
+  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))
 
-#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+#define _mm_cvt_roundi32_ss(A, B, R) \
+  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))
 
-#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \
+#ifdef __x86_64__
+#define _mm_cvt_roundsi64_ss(A, B, R) \
   (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
-                                    (int)(R)); })
+                                    (int)(R))
 
-#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \
+#define _mm_cvt_roundi64_ss(A, B, R) \
   (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
-                                    (int)(R)); })
+                                    (int)(R))
+#endif
 
-#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \
+#define _mm_cvt_roundss_sd(A, B, R) \
   (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)_mm_undefined_pd(), \
-                                              (__mmask8)-1, (int)(R)); })
+                                              (__mmask8)-1, (int)(R))
 
-#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) __extension__ ({ \
+#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
   (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)(__m128d)(W), \
-                                              (__mmask8)(U), (int)(R)); })
+                                              (__mmask8)(U), (int)(R))
 
-#define _mm_maskz_cvt_roundss_sd(U, A, B, R) __extension__ ({ \
+#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
   (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)_mm_setzero_pd(), \
-                                              (__mmask8)(U), (int)(R)); })
+                                              (__mmask8)(U), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
 {
-  return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
-                                              (__v4sf)(__B),
-                                              (__v2df)(__W),
-                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 
+  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
+                                            (__v4sf)__B,
+                                            (__v2df)__W,
+                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
 {
-  return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
-                                              (__v4sf)(__B),
-                                              (__v2df)_mm_setzero_pd(), 
-                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 
+  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
+                                            (__v4sf)__B,
+                                            (__v2df)_mm_setzero_pd(),
+                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_cvtu32_sd (__m128d __A, unsigned __B)
 {
-  return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
+  __A[0] = __B;
+  return __A;
 }
 
-#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_sd(A, B, R) \
   (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
-                                      (unsigned long long)(B), (int)(R)); })
+                                      (unsigned long long)(B), (int)(R))
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_cvtu64_sd (__m128d __A, unsigned long long __B)
 {
-  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
-                 _MM_FROUND_CUR_DIRECTION);
+  __A[0] = __B;
+  return __A;
 }
+#endif
 
-#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \
+#define _mm_cvt_roundu32_ss(A, B, R) \
   (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
-                                     (int)(R)); })
+                                     (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_cvtu32_ss (__m128 __A, unsigned __B)
 {
-  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
-                _MM_FROUND_CUR_DIRECTION);
+  __A[0] = __B;
+  return __A;
 }
 
-#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_ss(A, B, R) \
   (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
-                                     (unsigned long long)(B), (int)(R)); })
+                                     (unsigned long long)(B), (int)(R))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
 {
-  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
-                _MM_FROUND_CUR_DIRECTION);
+  __A[0] = __B;
+  return __A;
 }
+#endif
 
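+// Editorial note (not in the upstream header): the default-rounding unsigned
+// scalar conversions above no longer call a cvtusi2ss/sd builtin; the
+// subscript store "__A[0] = __B" is an ordinary element insertion with an
+// implicit unsigned-to-floating conversion, which the backend can still
+// match to VCVTUSI2SS/VCVTUSI2SD.  The _mm_cvt_round* macro forms keep the
+// explicit builtins because they must encode the rounding immediate R.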
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
-                 __M);
+  return (__m512i) __builtin_ia32_selectd_512(__M,
+                                              (__v16si) _mm512_set1_epi32(__A),
+                                              (__v16si) __O);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
-                 __M);
+  return (__m512i) __builtin_ia32_selectq_512(__M,
+                                              (__v8di) _mm512_set1_epi64(__A),
+                                              (__v8di) __O);
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
+    char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
+    char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
+    char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
+    char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
+    char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
+    char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
+    char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
+    char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
+    char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
+    char __e4, char __e3, char __e2, char __e1, char __e0) {
+
+  return __extension__ (__m512i)(__v64qi)
+    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
+     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
+     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
+     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
+     __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
+     __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
+     __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
+     __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
+    short __e27, short __e26, short __e25, short __e24, short __e23,
+    short __e22, short __e21, short __e20, short __e19, short __e18,
+    short __e17, short __e16, short __e15, short __e14, short __e13,
+    short __e12, short __e11, short __e10, short __e9, short __e8,
+    short __e7, short __e6, short __e5, short __e4, short __e3,
+    short __e2, short __e1, short __e0) {
+  return __extension__ (__m512i)(__v32hi)
+    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
+     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
+     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
+     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31};
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_set_epi32 (int __A, int __B, int __C, int __D,
      int __E, int __F, int __G, int __H,
      int __I, int __J, int __K, int __L,
@@ -9476,7 +9160,7 @@
   _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
                    (e5),(e4),(e3),(e2),(e1),(e0))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_set_epi64 (long long __A, long long __B, long long __C,
      long long __D, long long __E, long long __F,
      long long __G, long long __H)
@@ -9488,7 +9172,7 @@
 #define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)           \
   _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_set_pd (double __A, double __B, double __C, double __D,
         double __E, double __F, double __G, double __H)
 {
@@ -9499,7 +9183,7 @@
 #define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)              \
   _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_set_ps (float __A, float __B, float __C, float __D,
         float __E, float __F, float __G, float __H,
         float __I, float __J, float __K, float __L,
@@ -9514,30 +9198,401 @@
   _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
                 (e4),(e3),(e2),(e1),(e0))
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_abs_ps(__m512 A)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_abs_ps(__m512 __A)
 {
-  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ;
+  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), (__m512i)__A);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_abs_ps(__m512 W, __mmask16 K, __m512 A)
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
 {
-  return (__m512)_mm512_mask_and_epi32((__m512i)W, K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ;
+  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF), (__m512i)__A);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_abs_pd(__m512d A)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_abs_pd(__m512d __A)
 {
-  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A) ;
+  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), (__v8di)__A);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_abs_pd(__m512d W, __mmask8 K, __m512d A)
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
 {
-  return (__m512d)_mm512_mask_and_epi64((__v8di)W, K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A);
+  return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), (__v8di)__A);
 }
 
-#undef __DEFAULT_FN_ATTRS
+/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
+ * outputs. This class of vector operation forms the basis of many scientific
+ * computations. In vector-reduction arithmetic, the result of the evaluation
+ * is independent of the order of the input elements of V.
 
-#endif // __AVX512FINTRIN_H
+ * We use a bisection method: at each step, we partition the vector from the
+ * previous step in half, and the operation is performed on its two halves.
+ * This takes log2(n) steps where n is the number of elements in the vector.
+ */
+
+#define _mm512_mask_reduce_operator(op) \
+  __v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \
+  __v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \
+  __m256i __t3 = (__m256i)(__t1 op __t2); \
+  __v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \
+  __v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \
+  __v2du __t6 = __t4 op __t5; \
+  __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
+  __v2du __t8 = __t6 op __t7; \
+  return __t8[0];
+
+static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
+  _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
+  _mm512_mask_reduce_operator(*);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
+  _mm512_mask_reduce_operator(&);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
+  _mm512_mask_reduce_operator(|);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
+  __W = _mm512_maskz_mov_epi64(__M, __W);
+  _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
+  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
+  _mm512_mask_reduce_operator(*);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
+  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
+  _mm512_mask_reduce_operator(&);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
+  __W = _mm512_maskz_mov_epi64(__M, __W);
+  _mm512_mask_reduce_operator(|);
+}
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+  __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \
+  __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \
+  __m256d __t3 = __t1 op __t2; \
+  __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
+  __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
+  __m128d __t6 = __t4 op __t5; \
+  __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
+  __m128d __t8 = __t6 op __t7; \
+  return __t8[0];
+
+static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
+  _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
+  _mm512_mask_reduce_operator(*);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
+  __W = _mm512_maskz_mov_pd(__M, __W);
+  _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
+  __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
+  _mm512_mask_reduce_operator(*);
+}
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+  __v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \
+  __v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \
+  __m256i __t3 = (__m256i)(__t1 op __t2); \
+  __v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \
+  __v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \
+  __v4su __t6 = __t4 op __t5; \
+  __v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
+  __v4su __t8 = __t6 op __t7; \
+  __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
+  __v4su __t10 = __t8 op __t9; \
+  return __t10[0];
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_add_epi32(__m512i __W) {
+  _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_mul_epi32(__m512i __W) {
+  _mm512_mask_reduce_operator(*);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_and_epi32(__m512i __W) {
+  _mm512_mask_reduce_operator(&);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_or_epi32(__m512i __W) {
+  _mm512_mask_reduce_operator(|);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W) {
+  __W = _mm512_maskz_mov_epi32(__M, __W);
+  _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W) {
+  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
+  _mm512_mask_reduce_operator(*);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W) {
+  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
+  _mm512_mask_reduce_operator(&);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
+  __W = _mm512_maskz_mov_epi32(__M, __W);
+  _mm512_mask_reduce_operator(|);
+}
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+  __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \
+  __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \
+  __m256 __t3 = __t1 op __t2; \
+  __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
+  __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
+  __m128 __t6 = __t4 op __t5; \
+  __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
+  __m128 __t8 = __t6 op __t7; \
+  __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
+  __m128 __t10 = __t8 op __t9; \
+  return __t10[0];
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_reduce_add_ps(__m512 __W) {
+  _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_reduce_mul_ps(__m512 __W) {
+  _mm512_mask_reduce_operator(*);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
+  __W = _mm512_maskz_mov_ps(__M, __W);
+  _mm512_mask_reduce_operator(+);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
+  __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
+  _mm512_mask_reduce_operator(*);
+}
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+  __m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \
+  __m512i __t2 = _mm512_##op(__V, __t1); \
+  __m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \
+  __m512i __t4 = _mm512_##op(__t2, __t3); \
+  __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \
+  __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \
+  return __t6[0];
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_epi64(__m512i __V) {
+  _mm512_mask_reduce_operator(max_epi64);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_epu64(__m512i __V) {
+  _mm512_mask_reduce_operator(max_epu64);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_epi64(__m512i __V) {
+  _mm512_mask_reduce_operator(min_epi64);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_epu64(__m512i __V) {
+  _mm512_mask_reduce_operator(min_epu64);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
+  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
+  _mm512_mask_reduce_operator(max_epi64);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
+  __V = _mm512_maskz_mov_epi64(__M, __V);
+  _mm512_mask_reduce_operator(max_epu64);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
+  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
+  _mm512_mask_reduce_operator(min_epi64);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
+  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
+  _mm512_mask_reduce_operator(min_epu64);
+}
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+  __m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \
+  __m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \
+  __m256i __t3 = _mm256_##op(__t1, __t2); \
+  __m128i __t4 = _mm256_extracti128_si256(__t3, 0); \
+  __m128i __t5 = _mm256_extracti128_si256(__t3, 1); \
+  __m128i __t6 = _mm_##op(__t4, __t5); \
+  __m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \
+  __m128i __t8 = _mm_##op(__t6, __t7); \
+  __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \
+  __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \
+  return __t10[0];
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_epi32(__m512i __V) {
+  _mm512_mask_reduce_operator(max_epi32);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_epu32(__m512i __V) {
+  _mm512_mask_reduce_operator(max_epu32);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_epi32(__m512i __V) {
+  _mm512_mask_reduce_operator(min_epi32);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_epu32(__m512i __V) {
+  _mm512_mask_reduce_operator(min_epu32);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
+  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
+  _mm512_mask_reduce_operator(max_epi32);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
+  __V = _mm512_maskz_mov_epi32(__M, __V);
+  _mm512_mask_reduce_operator(max_epu32);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
+  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
+  _mm512_mask_reduce_operator(min_epi32);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
+  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
+  _mm512_mask_reduce_operator(min_epu32);
+}
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+  __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
+  __m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
+  __m256d __t3 = _mm256_##op(__t1, __t2); \
+  __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
+  __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
+  __m128d __t6 = _mm_##op(__t4, __t5); \
+  __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
+  __m128d __t8 = _mm_##op(__t6, __t7); \
+  return __t8[0];
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_pd(__m512d __V) {
+  _mm512_mask_reduce_operator(max_pd);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_pd(__m512d __V) {
+  _mm512_mask_reduce_operator(min_pd);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
+  __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
+  _mm512_mask_reduce_operator(max_pd);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
+  __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
+  _mm512_mask_reduce_operator(min_pd);
+}
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+  __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \
+  __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \
+  __m256 __t3 = _mm256_##op(__t1, __t2); \
+  __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
+  __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
+  __m128 __t6 = _mm_##op(__t4, __t5); \
+  __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
+  __m128 __t8 = _mm_##op(__t6, __t7); \
+  __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
+  __m128 __t10 = _mm_##op(__t8, __t9); \
+  return __t10[0];
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_ps(__m512 __V) {
+  _mm512_mask_reduce_operator(max_ps);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_ps(__m512 __V) {
+  _mm512_mask_reduce_operator(min_ps);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
+  __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
+  _mm512_mask_reduce_operator(max_ps);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
+  __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
+  _mm512_mask_reduce_operator(min_ps);
+}
+#undef _mm512_mask_reduce_operator
+
+#undef __DEFAULT_FN_ATTRS512
+#undef __DEFAULT_FN_ATTRS128
+
+#endif /* __AVX512FINTRIN_H */
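For reference, a minimal usage sketch of the new bisection reductions (a hypothetical test, not part of this change; it assumes an AVX-512F capable host and compilation with -mavx512f):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Eight 64-bit lanes holding 1..8; _mm512_set_epi64 takes the highest
     lane first, so lane 0 is 1 and lane 7 is 8. */
  __m512i v = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1);
  long long sum = _mm512_reduce_add_epi64(v);               /* 36 */
  long long max = _mm512_reduce_max_epi64(v);               /* 8 */
  /* Masked form: unselected lanes are replaced by the identity of the
     operation (0 for addition) before the reduction runs. */
  long long part = _mm512_mask_reduce_add_epi64(0x0F, v);   /* 1+2+3+4 = 10 */
  printf("%lld %lld %lld\n", sum, max, part);
  return 0;
}

Because each step folds two halves with the same operator, the result is the same for any ordering of the input lanes, which is what the comment above the reduction macros promises.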
diff --git a/darwin-x86/clang-headers/avx512ifmaintrin.h b/darwin-x86/clang-headers/avx512ifmaintrin.h
index 5defbae..1597130 100644
--- a/darwin-x86/clang-headers/avx512ifmaintrin.h
+++ b/darwin-x86/clang-headers/avx512ifmaintrin.h
@@ -29,62 +29,52 @@
 #define __IFMAINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512)))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
 {
-  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
-                   (__v8di) __Y,
-                   (__v8di) __Z,
-                   (__mmask8) -1);
+  return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
+                                                (__v8di) __Z);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
-          __m512i __Y)
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
-                   (__v8di) __X,
-                   (__v8di) __Y,
-                   (__mmask8) __M);
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                   (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
+                                   (__v8di)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
 {
-  return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
-              (__v8di) __Y,
-              (__v8di) __Z,
-              (__mmask8) __M);
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                   (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
+                                   (__v8di)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
 {
-  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
-                   (__v8di) __Y,
-                   (__v8di) __Z,
-                   (__mmask8) -1);
+  return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
+                                                (__v8di) __Z);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
-          __m512i __Y)
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
-                   (__v8di) __X,
-                   (__v8di) __Y,
-                   (__mmask8) __M);
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                   (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
+                                   (__v8di)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
 {
-  return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
-              (__v8di) __Y,
-              (__v8di) __Z,
-              (__mmask8) __M);
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                   (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
+                                   (__v8di)_mm512_setzero_si512());
 }
 
 #undef __DEFAULT_FN_ATTRS
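The rewrite above follows a pattern used throughout this update: per-intrinsic masked and zero-masked builtins are dropped in favor of the unmasked builtin composed with a generic lane select. A scalar model of that blend for one eight-lane 64-bit vector (an illustration of the semantics, not clang's implementation):

#include <stdint.h>

/* Models __builtin_ia32_selectq_512: for each 64-bit lane i, keep the
   operation's result when mask bit i is set, else the passthrough lane.
   Zero-masking is the same blend with a zero vector as passthrough. */
static void select_q512(uint8_t mask, const int64_t op[8],
                        const int64_t passthru[8], int64_t out[8]) {
  for (int i = 0; i < 8; ++i)
    out[i] = ((mask >> i) & 1) ? op[i] : passthru[i];
}

Expressing masking this way gives the optimizer one generic blend it already knows how to fold, instead of one opaque masked builtin per intrinsic.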
diff --git a/darwin-x86/clang-headers/avx512ifmavlintrin.h b/darwin-x86/clang-headers/avx512ifmavlintrin.h
index 131ee5c..afdea88 100644
--- a/darwin-x86/clang-headers/avx512ifmavlintrin.h
+++ b/darwin-x86/clang-headers/avx512ifmavlintrin.h
@@ -29,121 +29,105 @@
 #define __IFMAVLINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
 
 
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
 {
-  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
-                   (__v2di) __Y,
-                   (__v2di) __Z,
-                   (__mmask8) -1);
+  return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
+                                                (__v2di) __Z);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
-                   (__v2di) __X,
-                   (__v2di) __Y,
-                   (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                      (__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
+                                      (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
 {
-  return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
-              (__v2di) __Y,
-              (__v2di) __Z,
-              (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                      (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
+                                      (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
 {
-  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
-                   (__v4di) __Y,
-                   (__v4di) __Z,
-                   (__mmask8) -1);
+  return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
+                                                (__v4di)__Z);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
-          __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
-                   (__v4di) __X,
-                   (__v4di) __Y,
-                   (__mmask8) __M);
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                   (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
+                                   (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
 {
-  return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
-              (__v4di) __Y,
-              (__v4di) __Z,
-              (__mmask8) __M);
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                   (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
+                                   (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
 {
-  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
-                   (__v2di) __Y,
-                   (__v2di) __Z,
-                   (__mmask8) -1);
+  return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
+                                                (__v2di)__Z);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
-                   (__v2di) __X,
-                   (__v2di) __Y,
-                   (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                      (__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
+                                      (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
 {
-  return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
-              (__v2di) __Y,
-              (__v2di) __Z,
-              (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                      (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
+                                      (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
 {
-  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
-                   (__v4di) __Y,
-                   (__v4di) __Z,
-                   (__mmask8) -1);
+  return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
+                                                (__v4di)__Z);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
-          __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
-                   (__v4di) __X,
-                   (__v4di) __Y,
-                   (__mmask8) __M);
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                   (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
+                                   (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
 {
-  return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
-              (__v4di) __Y,
-              (__v4di) __Z,
-              (__mmask8) __M);
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                   (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
+                                   (__v4di)_mm256_setzero_si256());
 }
 
 
-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
 
 #endif
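For clarity, a scalar model of what each 64-bit lane of these IFMA intrinsics computes, per the documented VPMADD52LUQ/VPMADD52HUQ semantics (a sketch for exposition, not clang's lowering):

#include <stdint.h>

#define MASK52 ((1ULL << 52) - 1)

/* x + low 52 bits of the 104-bit product of the low 52 bits of y and z. */
static uint64_t madd52lo_lane(uint64_t x, uint64_t y, uint64_t z) {
  unsigned __int128 p = (unsigned __int128)(y & MASK52) * (z & MASK52);
  return x + (uint64_t)(p & MASK52);
}

/* x + high 52 bits (bits 103:52) of the same product. */
static uint64_t madd52hi_lane(uint64_t x, uint64_t y, uint64_t z) {
  unsigned __int128 p = (unsigned __int128)(y & MASK52) * (z & MASK52);
  return x + (uint64_t)(p >> 52);
}

Note that `unsigned __int128` is a GCC/clang extension; it stands in here for the 104-bit intermediate product the hardware keeps.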
diff --git a/darwin-x86/clang-headers/avx512pfintrin.h b/darwin-x86/clang-headers/avx512pfintrin.h
index c7fa3cf..5b8260b 100644
--- a/darwin-x86/clang-headers/avx512pfintrin.h
+++ b/darwin-x86/clang-headers/avx512pfintrin.h
@@ -1,4 +1,4 @@
-/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
+/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
  *
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -31,80 +31,80 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
 
-#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
   __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
                              (long long const *)(addr), (int)(scale), \
-                             (int)(hint)); })
-              
-#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
+                             (int)(hint))
+
+#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
   __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
                              (long long const *)(addr), (int)(scale), \
-                             (int)(hint)); })
+                             (int)(hint))
 
-#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\
+#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
   __builtin_ia32_gatherpfdps((__mmask16)(mask), \
                              (__v16si)(__m512i)(index), (int const *)(addr), \
-                             (int)(scale), (int)(hint)); })
+                             (int)(scale), (int)(hint))
 
-#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\
+#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
   __builtin_ia32_gatherpfdps((__mmask16) -1, \
                              (__v16si)(__m512i)(index), (int const *)(addr), \
-                             (int)(scale), (int)(hint)); })
+                             (int)(scale), (int)(hint))
 
-#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
   __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
                              (long long const *)(addr), (int)(scale), \
-                             (int)(hint)); })
+                             (int)(hint))
 
-#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
   __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
                              (long long const *)(addr), (int)(scale), \
-                             (int)(hint)); })
-              
-#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\
+                             (int)(hint))
+
+#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
   __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                             (int const *)(addr), (int)(scale), (int)(hint)); })
+                             (int const *)(addr), (int)(scale), (int)(hint))
 
-#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\
+#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
   __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
-                             (int const *)(addr), (int)(scale), (int)(hint)); })
+                             (int const *)(addr), (int)(scale), (int)(hint))
 
-#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
   __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
                               (long long *)(addr), (int)(scale), \
-                              (int)(hint)); })
+                              (int)(hint))
 
-#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
   __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
                               (long long *)(addr), (int)(scale), \
-                              (int)(hint)); })
+                              (int)(hint))
 
-#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
   __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
-                              (int *)(addr), (int)(scale), (int)(hint)); })
+                              (int *)(addr), (int)(scale), (int)(hint))
 
-#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
   __builtin_ia32_scatterpfdps((__mmask16)(mask), \
                               (__v16si)(__m512i)(index), (int *)(addr), \
-                              (int)(scale), (int)(hint)); })
+                              (int)(scale), (int)(hint))
 
-#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
   __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
                               (long long *)(addr), (int)(scale), \
-                              (int)(hint)); })
+                              (int)(hint))
 
-#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
   __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
                               (long long *)(addr), (int)(scale), \
-                              (int)(hint)); })
+                              (int)(hint))
 
-#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
   __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
-                              (int *)(addr), (int)(scale), (int)(hint)); })
+                              (int *)(addr), (int)(scale), (int)(hint))
 
-#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
   __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                              (int *)(addr), (int)(scale), (int)(hint)); })
+                              (int *)(addr), (int)(scale), (int)(hint))
 
 #undef __DEFAULT_FN_ATTRS
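This hunk drops the GNU statement-expression wrapper (`__extension__ ({ ... })`) from the prefetch macros. Each body is a single void builtin call, so the wrapper added nothing and only made the macros trip -Wgnu-statement-expression. Callers are unaffected; a hypothetical usage sketch, assuming compilation with -mavx512pf:

#include <immintrin.h>

/* Prefetch the 16 floats table[idx[i]] into the L1 cache.  The scale of 4
   matches sizeof(float); the hint selects the target cache level. */
void warm_gather(const float *table, __m512i idx) {
  _mm512_prefetch_i32gather_ps(idx, table, 4, _MM_HINT_T0);
}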
 
diff --git a/darwin-x86/clang-headers/avx512vbmi2intrin.h b/darwin-x86/clang-headers/avx512vbmi2intrin.h
new file mode 100644
index 0000000..d2a5809
--- /dev/null
+++ b/darwin-x86/clang-headers/avx512vbmi2intrin.h
@@ -0,0 +1,397 @@
+/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VBMI2INTRIN_H
+#define __AVX512VBMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512)))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
+              (__v32hi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
+              (__v32hi) _mm512_setzero_si512(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
+              (__v64qi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
+              (__v64qi) _mm512_setzero_si512(),
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
+{
+  __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
+{
+  __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
+              (__v32hi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
+              (__v32hi) _mm512_setzero_si512(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
+              (__v64qi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
+              (__v64qi) _mm512_setzero_si512(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
+              (__v32hi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
+              (__v32hi) _mm512_setzero_si512(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
+              (__v64qi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
+              (__v64qi) _mm512_setzero_si512(),
+              __U);
+}
+
+#define _mm512_shldi_epi64(A, B, I) \
+  (__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
+                                     (__v8di)(__m512i)(B), (int)(I))
+
+#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                    (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
+                                    (__v8di)(__m512i)(S))
+
+#define _mm512_maskz_shldi_epi64(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                    (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
+                                    (__v8di)_mm512_setzero_si512())
+
+#define _mm512_shldi_epi32(A, B, I) \
+  (__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
+                                     (__v16si)(__m512i)(B), (int)(I))
+
+#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                   (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
+                                   (__v16si)(__m512i)(S))
+
+#define _mm512_maskz_shldi_epi32(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                   (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
+                                   (__v16si)_mm512_setzero_si512())
+
+#define _mm512_shldi_epi16(A, B, I) \
+  (__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
+                                     (__v32hi)(__m512i)(B), (int)(I))
+
+#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                   (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
+                                   (__v32hi)(__m512i)(S))
+
+#define _mm512_maskz_shldi_epi16(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                   (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
+                                   (__v32hi)_mm512_setzero_si512())
+
+#define _mm512_shrdi_epi64(A, B, I) \
+  (__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
+                                     (__v8di)(__m512i)(B), (int)(I))
+
+#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                    (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
+                                    (__v8di)(__m512i)(S))
+
+#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                    (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
+                                    (__v8di)_mm512_setzero_si512())
+
+#define _mm512_shrdi_epi32(A, B, I) \
+  (__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
+                                     (__v16si)(__m512i)(B), (int)(I))
+
+#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                   (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
+                                   (__v16si)(__m512i)(S))
+
+#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                   (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
+                                   (__v16si)_mm512_setzero_si512())
+
+#define _mm512_shrdi_epi16(A, B, I) \
+  (__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
+                                     (__v32hi)(__m512i)(B), (int)(I))
+
+#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                   (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
+                                   (__v32hi)(__m512i)(S))
+
+#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                   (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
+                                   (__v32hi)_mm512_setzero_si512())
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvq512_maskz ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi64(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvw512_maskz ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi16(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvq512_maskz ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi64(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvw512_maskz ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi16(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
+
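Among the additions in this new header are the concatenate-and-shift ("funnel shift") intrinsics. A scalar model of one 64-bit lane, per the documented VPSHLDQ/VPSHRDQ semantics (a sketch for exposition, not clang's lowering):

#include <stdint.h>

/* _mm512_shldi_epi64(A, B, I): concatenate a:b into 128 bits, shift left
   by I mod 64, keep the upper 64 bits.  I == 0 returns a unchanged. */
static uint64_t shld_lane(uint64_t a, uint64_t b, unsigned imm) {
  unsigned __int128 cat = ((unsigned __int128)a << 64) | b;
  return (uint64_t)((cat << (imm & 63)) >> 64);
}

/* _mm512_shrdi_epi64(A, B, I): concatenate b:a, shift right by I mod 64,
   keep the lower 64 bits.  I == 0 returns a unchanged. */
static uint64_t shrd_lane(uint64_t a, uint64_t b, unsigned imm) {
  unsigned __int128 cat = ((unsigned __int128)b << 64) | a;
  return (uint64_t)(cat >> (imm & 63));
}

The masked shldi/shrdi variants compose these with the same generic select blend used elsewhere in this update.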
diff --git a/darwin-x86/clang-headers/avx512vbmiintrin.h b/darwin-x86/clang-headers/avx512vbmiintrin.h
index 837238e..b6e93c2 100644
--- a/darwin-x86/clang-headers/avx512vbmiintrin.h
+++ b/darwin-x86/clang-headers/avx512vbmiintrin.h
@@ -29,79 +29,65 @@
 #define __VBMIINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512)))
 
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
-         __mmask64 __U, __m512i __B)
+_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
-              (__v64qi) __I
-              /* idx */ ,
-              (__v64qi) __B,
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
+                                                 (__v64qi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
+_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
+                              __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
-              /* idx */ ,
-              (__v64qi) __A,
-              (__v64qi) __B,
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_selectb_512(__U,
+                               (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
+                               (__v64qi)__A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
-        __m512i __I, __m512i __B)
+_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
+                               __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
-              /* idx */ ,
-              (__v64qi) __A,
-              (__v64qi) __B,
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512(__U,
+                               (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
+                               (__v64qi)__I);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
-         __m512i __I, __m512i __B)
+_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
+                               __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
-               /* idx */ ,
-               (__v64qi) __A,
-               (__v64qi) __B,
-               (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512(__U,
+                               (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
+                               (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
-                 (__v64qi) __A,
-                 (__v64qi) _mm512_undefined_epi32 (),
-                 (__mmask64) -1);
+  return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
         __m512i __B)
 {
-  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
-                 (__v64qi) __A,
-                 (__v64qi) _mm512_setzero_si512(),
-                 (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                     (__v64qi)_mm512_permutexvar_epi8(__A, __B),
+                                     (__v64qi)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
              __m512i __B)
 {
-  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
-                 (__v64qi) __A,
-                 (__v64qi) __W,
-                 (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                     (__v64qi)_mm512_permutexvar_epi8(__A, __B),
+                                     (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
diff --git a/darwin-x86/clang-headers/avx512vbmivlintrin.h b/darwin-x86/clang-headers/avx512vbmivlintrin.h
index 105c6d1..9a0400b 100644
--- a/darwin-x86/clang-headers/avx512vbmivlintrin.h
+++ b/darwin-x86/clang-headers/avx512vbmivlintrin.h
@@ -29,161 +29,127 @@
 #define __VBMIVLINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256)))
 
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
-            __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
-              (__v16qi) __I
-              /* idx */ ,
-              (__v16qi) __B,
-              (__mmask16)
-              __U);
+  return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
+                                                 (__v16qi)__I,
+                                                 (__v16qi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
-         __mmask32 __U, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
+                           __m128i __B)
 {
-  return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
-              (__v32qi) __I
-              /* idx */ ,
-              (__v32qi) __B,
-              (__mmask32)
-              __U);
+  return (__m128i)__builtin_ia32_selectb_128(__U,
+                                  (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
+                                  (__v16qi)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
+                            __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
-              /* idx */ ,
-              (__v16qi) __A,
-              (__v16qi) __B,
-              (__mmask16) -
-              1);
+  return (__m128i)__builtin_ia32_selectb_128(__U,
+                                  (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
+                                  (__v16qi)__I);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
-           __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
+                            __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
-              /* idx */ ,
-              (__v16qi) __A,
-              (__v16qi) __B,
-              (__mmask16)
-              __U);
+  return (__m128i)__builtin_ia32_selectb_128(__U,
+                                  (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
+                                  (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
-            __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
-               /* idx */ ,
-               (__v16qi) __A,
-               (__v16qi) __B,
-               (__mmask16)
-               __U);
+  return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
+                                                 (__v32qi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
+                              __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
-              /* idx */ ,
-              (__v32qi) __A,
-              (__v32qi) __B,
-              (__mmask32) -
-              1);
+  return (__m256i)__builtin_ia32_selectb_256(__U,
+                               (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
+                               (__v32qi)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
-        __m256i __I, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
+                               __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
-              /* idx */ ,
-              (__v32qi) __A,
-              (__v32qi) __B,
-              (__mmask32)
-              __U);
+  return (__m256i)__builtin_ia32_selectb_256(__U,
+                               (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
+                               (__v32qi)__I);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
-         __m256i __I, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
+                               __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
-               /* idx */ ,
-               (__v32qi) __A,
-               (__v32qi) __B,
-               (__mmask32)
-               __U);
+  return (__m256i)__builtin_ia32_selectb_256(__U,
+                               (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
+                               (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_permutexvar_epi8 (__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
-                 (__v16qi) __A,
-                 (__v16qi) _mm_undefined_si128 (),
-                 (__mmask16) -1);
+  return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
-                 (__v16qi) __A,
-                 (__v16qi) _mm_setzero_si128 (),
-                 (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                        (__v16qi)_mm_permutexvar_epi8(__A, __B),
+                                        (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
           __m128i __B)
 {
-  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
-                 (__v16qi) __A,
-                 (__v16qi) __W,
-                 (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                        (__v16qi)_mm_permutexvar_epi8(__A, __B),
+                                        (__v16qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
-                 (__v32qi) __A,
-                 (__v32qi) _mm256_undefined_si256 (),
-                 (__mmask32) -1);
+  return (__m256i)__builtin_ia32_permvarqi256((__v32qi)__B, (__v32qi)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
         __m256i __B)
 {
-  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
-                 (__v32qi) __A,
-                 (__v32qi) _mm256_setzero_si256 (),
-                 (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
+                                     (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
              __m256i __B)
 {
-  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
-                 (__v32qi) __A,
-                 (__v32qi) __W,
-                 (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
+                                     (__v32qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
 {
   return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
@@ -192,7 +158,7 @@
                 (__mmask16) __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
 {
   return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
@@ -202,7 +168,7 @@
                 (__mmask16) __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
 {
   return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
@@ -212,7 +178,7 @@
                 (__mmask16) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
 {
   return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
@@ -221,7 +187,7 @@
                 (__mmask32) __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
 {
   return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
@@ -231,7 +197,7 @@
                 (__mmask32) __M);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
 {
   return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
@@ -242,6 +208,7 @@
 }
 
 
-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
 
 #endif
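Note: besides the select-based masking, this file splits __DEFAULT_FN_ATTRS into 128- and 256-bit variants that add __min_vector_width__(n). That attribute records the narrowest vector register width a function actually needs, so each intrinsic is tagged with the width it operates on instead of a blanket default, which keeps 128-bit-only intrinsics from being treated as requiring wider registers under a preferred-vector-width setting. A hedged sketch of the convention (the macro and function names are local to this example):

#define SKETCH_ATTRS128 __attribute__((__always_inline__, __nodebug__, \
    __target__("avx512vbmi,avx512vl"), __min_vector_width__(128)))

/* A 128-bit intrinsic advertises a 128-bit minimum vector width. */
static __inline__ __m128i SKETCH_ATTRS128
sketch_permute(__m128i __A, __m128i __B)
{
  return _mm_permutexvar_epi8(__A, __B);
}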
diff --git a/darwin-x86/clang-headers/avx512vlbitalgintrin.h b/darwin-x86/clang-headers/avx512vlbitalgintrin.h
new file mode 100644
index 0000000..64860b2
--- /dev/null
+++ b/darwin-x86/clang-headers/avx512vlbitalgintrin.h
@@ -0,0 +1,159 @@
+/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBITALGINTRIN_H
+#define __AVX512VLBITALGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi16(__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
+              (__v16hi) _mm256_popcnt_epi16(__B),
+              (__v16hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
+{
+  return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
+              __U,
+              __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi16(__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
+              (__v8hi) _mm_popcnt_epi16(__B),
+              (__v8hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
+{
+  return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
+              __U,
+              __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi8(__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
+              (__v32qi) _mm256_popcnt_epi8(__B),
+              (__v32qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
+{
+  return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
+              __U,
+              __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi8(__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
+              (__v16qi) _mm_popcnt_epi8(__B),
+              (__v16qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
+{
+  return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
+              __U,
+              __B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
+              (__v32qi) __B,
+              __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
+{
+  return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
+              __A,
+              __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
+              (__v16qi) __B,
+              __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
+{
+  return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
+              __A,
+              __B);
+}
+
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
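Note: avx512vlbitalgintrin.h is new in this drop and exposes the 128/256-bit forms of the BITALG instructions (per-byte and per-word population count, plus the VPSHUFBITQMB bit-shuffle mask operations). An illustrative usage sketch, assuming the translation unit is built with AVX512VL and AVX512BITALG enabled (e.g. -mavx512vl -mavx512bitalg); the function and variable names are local to this example:

#include <immintrin.h>

/* Count set bits in each byte of __v, keeping only the lanes selected by
 * __keep; unselected lanes fall back to the corresponding byte of __src. */
static __m256i masked_byte_popcount(__m256i __src, __mmask32 __keep,
                                    __m256i __v)
{
  return _mm256_mask_popcnt_epi8(__src, __keep, __v);
}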
diff --git a/darwin-x86/clang-headers/avx512vlbwintrin.h b/darwin-x86/clang-headers/avx512vlbwintrin.h
index 990e992..1b038dd 100644
--- a/darwin-x86/clang-headers/avx512vlbwintrin.h
+++ b/darwin-x86/clang-headers/avx512vlbwintrin.h
@@ -29,761 +29,432 @@
 #define __AVX512VLBWINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw")))
-
-static  __inline __m128i __DEFAULT_FN_ATTRS
-_mm_setzero_hi(void){
-    return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
-}
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(256)))
 
 /* Integer compare */
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
-                                                   (__mmask16)-1);
-}
+#define _mm_cmp_epi8_mask(a, b, p) \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), (int)(p), \
+                                         (__mmask16)-1)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
-                                                   __u);
-}
+#define _mm_mask_cmp_epi8_mask(m, a, b, p) \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), (int)(p), \
+                                         (__mmask16)(m))
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
-                                                 (__mmask16)-1);
-}
+#define _mm_cmp_epu8_mask(a, b, p) \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), (int)(p), \
+                                          (__mmask16)-1)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
-                                                 __u);
-}
+#define _mm_mask_cmp_epu8_mask(m, a, b, p) \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), (int)(p), \
+                                          (__mmask16)(m))
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
-                                                   (__mmask32)-1);
-}
+#define _mm256_cmp_epi8_mask(a, b, p) \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), (int)(p), \
+                                         (__mmask32)-1)
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
-                                                   __u);
-}
+#define _mm256_mask_cmp_epi8_mask(m, a, b, p) \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), (int)(p), \
+                                         (__mmask32)(m))
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
-                                                 (__mmask32)-1);
-}
+#define _mm256_cmp_epu8_mask(a, b, p) \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), (int)(p), \
+                                          (__mmask32)-1)
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
-                                                 __u);
-}
+#define _mm256_mask_cmp_epu8_mask(m, a, b, p) \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), (int)(p), \
+                                          (__mmask32)(m))
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
-                                                  (__mmask8)-1);
-}
+#define _mm_cmp_epi16_mask(a, b, p) \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1)
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
-                                                  __u);
-}
+#define _mm_mask_cmp_epi16_mask(m, a, b, p) \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m))
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
-                                                (__mmask8)-1);
-}
+#define _mm_cmp_epu16_mask(a, b, p) \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1)
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
-                                                __u);
-}
+#define _mm_mask_cmp_epu16_mask(m, a, b, p) \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m))
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
-                                                   (__mmask16)-1);
-}
+#define _mm256_cmp_epi16_mask(a, b, p) \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), (int)(p), \
+                                         (__mmask16)-1)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
-                                                   __u);
-}
+#define _mm256_mask_cmp_epi16_mask(m, a, b, p) \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), (int)(p), \
+                                         (__mmask16)(m))
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
-                                                 (__mmask16)-1);
-}
+#define _mm256_cmp_epu16_mask(a, b, p) \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), (int)(p), \
+                                          (__mmask16)-1)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
-                                                 __u);
-}
+#define _mm256_mask_cmp_epu16_mask(m, a, b, p) \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), (int)(p), \
+                                          (__mmask16)(m))
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
-                                                (__mmask16)-1);
-}
+#define _mm_cmpeq_epi8_mask(A, B) \
+    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epi8_mask(k, A, B) \
+    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epi8_mask(A, B) \
+    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epi8_mask(k, A, B) \
+    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epi8_mask(A, B) \
+    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epi8_mask(k, A, B) \
+    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epi8_mask(A, B) \
+    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epi8_mask(k, A, B) \
+    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epi8_mask(A, B) \
+    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epi8_mask(k, A, B) \
+    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epi8_mask(A, B) \
+    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epi8_mask(k, A, B) \
+    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
-                                                __u);
-}
+#define _mm256_cmpeq_epi8_mask(A, B) \
+    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epi8_mask(k, A, B) \
+    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epi8_mask(A, B) \
+    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epi8_mask(k, A, B) \
+    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epi8_mask(A, B) \
+    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epi8_mask(k, A, B) \
+    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epi8_mask(A, B) \
+    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epi8_mask(k, A, B) \
+    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epi8_mask(A, B) \
+    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epi8_mask(k, A, B) \
+    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epi8_mask(A, B) \
+    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epi8_mask(k, A, B) \
+    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
-                                                 (__mmask16)-1);
-}
+#define _mm_cmpeq_epu8_mask(A, B) \
+    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epu8_mask(k, A, B) \
+    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epu8_mask(A, B) \
+    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epu8_mask(k, A, B) \
+    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epu8_mask(A, B) \
+    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epu8_mask(k, A, B) \
+    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epu8_mask(A, B) \
+    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epu8_mask(k, A, B) \
+    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epu8_mask(A, B) \
+    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epu8_mask(k, A, B) \
+    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epu8_mask(A, B) \
+    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epu8_mask(k, A, B) \
+    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
-                                                 __u);
-}
+#define _mm256_cmpeq_epu8_mask(A, B) \
+    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epu8_mask(k, A, B) \
+    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epu8_mask(A, B) \
+    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epu8_mask(k, A, B) \
+    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epu8_mask(A, B) \
+    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epu8_mask(k, A, B) \
+    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epu8_mask(A, B) \
+    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epu8_mask(k, A, B) \
+    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epu8_mask(A, B) \
+    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epu8_mask(k, A, B) \
+    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epu8_mask(A, B) \
+    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epu8_mask(k, A, B) \
+    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
-                                                (__mmask32)-1);
-}
+#define _mm_cmpeq_epi16_mask(A, B) \
+    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epi16_mask(k, A, B) \
+    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epi16_mask(A, B) \
+    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epi16_mask(k, A, B) \
+    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epi16_mask(A, B) \
+    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epi16_mask(k, A, B) \
+    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epi16_mask(A, B) \
+    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epi16_mask(k, A, B) \
+    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epi16_mask(A, B) \
+    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epi16_mask(k, A, B) \
+    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epi16_mask(A, B) \
+    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epi16_mask(k, A, B) \
+    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
-                                                __u);
-}
+#define _mm256_cmpeq_epi16_mask(A, B) \
+    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epi16_mask(k, A, B) \
+    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epi16_mask(A, B) \
+    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epi16_mask(k, A, B) \
+    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epi16_mask(A, B) \
+    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epi16_mask(k, A, B) \
+    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epi16_mask(A, B) \
+    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epi16_mask(k, A, B) \
+    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epi16_mask(A, B) \
+    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epi16_mask(k, A, B) \
+    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epi16_mask(A, B) \
+    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epi16_mask(k, A, B) \
+    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
-                                                 (__mmask32)-1);
-}
+#define _mm_cmpeq_epu16_mask(A, B) \
+    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epu16_mask(k, A, B) \
+    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epu16_mask(A, B) \
+    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epu16_mask(k, A, B) \
+    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epu16_mask(A, B) \
+    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epu16_mask(k, A, B) \
+    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epu16_mask(A, B) \
+    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epu16_mask(k, A, B) \
+    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epu16_mask(A, B) \
+    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epu16_mask(k, A, B) \
+    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epu16_mask(A, B) \
+    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epu16_mask(k, A, B) \
+    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
-                                                 __u);
-}
+#define _mm256_cmpeq_epu16_mask(A, B) \
+    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epu16_mask(k, A, B) \
+    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epu16_mask(A, B) \
+    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epu16_mask(k, A, B) \
+    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epu16_mask(A, B) \
+    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epu16_mask(k, A, B) \
+    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epu16_mask(A, B) \
+    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epu16_mask(k, A, B) \
+    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epu16_mask(A, B) \
+    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epu16_mask(k, A, B) \
+    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epu16_mask(A, B) \
+    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epu16_mask(k, A, B) \
+    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
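Note: the block of macros above replaces several hundred lines of per-predicate inline functions that hard-coded the comparison immediates (0 for EQ, 1 for LT, 2 for LE, 4 for NE, 5 for GE, 6 for GT). Each named comparison is now a thin alias over the generic _mm*_cmp_ep*_mask form with an _MM_CMPINT_* predicate, so the two calls in the sketch below yield the same lane mask (illustrative only; names are local to this example):

static __mmask16 sketch_cmp(__m128i __a, __m128i __b)
{
  __mmask16 __named   = _mm_cmpeq_epi8_mask(__a, __b);
  __mmask16 __generic = _mm_cmp_epi8_mask(__a, __b, _MM_CMPINT_EQ);
  return __named & __generic; /* identical masks */
}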
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
-                                               (__mmask8)-1);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_add_epi8(__A, __B),
+                                             (__v32qi)__W);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
-                                               __u);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_add_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
-                                                (__mmask8)-1);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_add_epi16(__A, __B),
+                                             (__v16hi)__W);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
-                                                __u);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_add_epi16(__A, __B),
+                                             (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
-                                                (__mmask16)-1);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_sub_epi8(__A, __B),
+                                             (__v32qi)__W);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
-                                                __u);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_sub_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
-                                                 (__mmask16)-1);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_sub_epi16(__A, __B),
+                                             (__v16hi)__W);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
-                                                 __u);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_sub_epi16(__A, __B),
+                                             (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
-                                                   (__mmask16)-1);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_add_epi8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
-                                                   __u);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_add_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
-                                                 (__mmask16)-1);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_add_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
-                                                 __u);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_add_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
-                                                   (__mmask32)-1);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_sub_epi8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
-                                                   __u);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_sub_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
-                                                 (__mmask32)-1);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sub_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
-                                                 __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
-                                                  (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
-                                                  __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
-                                                   (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
-                                                   __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
-                                                 __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
-                                                 __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
-                                                (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
-                                                 (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
-                                                 __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
-                                                 __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
-                                                 __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
-                                                (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
-                                                 (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
-                                                 __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
-                                                 __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
-  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
-                                                 __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
-                                                (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
-                                                 (__mmask32)-1);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
-  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
-                                                 __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
-                                                (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
-                                                 (__mmask16)-1);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
-  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
-                                                 __u);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
-  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
-             (__v32qi) __B,
-             (__v32qi) __W,
-             (__mmask32) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
-             (__v32qi) __B,
-             (__v32qi)
-             _mm256_setzero_si256 (),
-             (__mmask32) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
-             (__v16hi) __B,
-             (__v16hi) __W,
-             (__mmask16) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
-             (__v16hi) __B,
-             (__v16hi)
-             _mm256_setzero_si256 (),
-             (__mmask16) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
-             (__v32qi) __B,
-             (__v32qi) __W,
-             (__mmask32) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
-             (__v32qi) __B,
-             (__v32qi)
-             _mm256_setzero_si256 (),
-             (__mmask32) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
-             (__v16hi) __B,
-             (__v16hi) __W,
-             (__mmask16) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
-             (__v16hi) __B,
-             (__v16hi)
-             _mm256_setzero_si256 (),
-             (__mmask16) __U);
-}
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
-             (__v16qi) __B,
-             (__v16qi) __W,
-             (__mmask16) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
-             (__v16qi) __B,
-             (__v16qi)
-             _mm_setzero_si128 (),
-             (__mmask16) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi) __W,
-             (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
-             (__v16qi) __B,
-             (__v16qi) __W,
-             (__mmask16) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
-             (__v16qi) __B,
-             (__v16qi)
-             _mm_setzero_si128 (),
-             (__mmask16) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi) __W,
-             (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sub_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
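/*
 * The replacement pattern above composes the ordinary unmasked intrinsic with
 * a generic per-lane select builtin instead of calling a dedicated *_mask
 * builtin.  A minimal scalar model of the select step for the 8 x 16-bit
 * case (illustrative only; the real __builtin_ia32_selectw_128 operates on
 * vector types and is folded into the instruction's own write-masking):
 */
#include <stdint.h>

static void selectw_128_model(uint8_t mask, const int16_t op[8],
                              const int16_t fallback[8], int16_t out[8]) {
  for (int i = 0; i < 8; ++i)
    out[i] = ((mask >> i) & 1) ? op[i] : fallback[i]; /* bit i picks lane i */
}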
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi) __W,
-              (__mmask16) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_mullo_epi16(__A, __B),
+                                             (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi)
-              _mm256_setzero_si256 (),
-              (__mmask16) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_mullo_epi16(__A, __B),
+                                             (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
-              (__v8hi) __B,
-              (__v8hi) __W,
-              (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mullo_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
-              (__v8hi) __B,
-              (__v8hi)
-              _mm_setzero_si128 (),
-              (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mullo_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
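/*
 * The _mask_ and _maskz_ pairs above differ only in the fallback operand:
 * merge-masking takes an explicit source __W, while zero-masking substitutes
 * _mm_setzero_si128().  A hypothetical caller (demo_mullo_masking is not part
 * of the header) showing both forms on the same inputs:
 */
#include <immintrin.h>

__m128i demo_mullo_masking(__m128i a, __m128i b, __m128i w) {
  /* The low four lanes get a*b in both results; the high four lanes come
     from w in the merged result and are zero in the zero-masked result. */
  __m128i merged = _mm_mask_mullo_epi16(w, 0x0F, a, b);
  __m128i zeroed = _mm_maskz_mullo_epi16(0x0F, a, b);
  return _mm_or_si128(merged, zeroed);
}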
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
 {
   return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
@@ -791,7 +462,7 @@
               (__v16qi) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
 {
   return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
@@ -799,7 +470,7 @@
                (__v32qi) __A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
 {
   return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
@@ -807,7 +478,7 @@
                (__v8hi) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
 {
   return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
@@ -815,2014 +486,1721 @@
                (__v16hi) __A);
 }
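/*
 * The blend intrinsics are the degenerate case of the same select: no
 * computation, every output lane is taken verbatim from one of the two
 * inputs.  Note the argument order, with __W selected where the mask bit is
 * set and __A where it is clear.  A hypothetical caller:
 */
#include <immintrin.h>

__m128i demo_blend(__m128i a, __m128i w) {
  /* Mask 0xF0: lanes 0-3 come from a, lanes 4-7 from w. */
  return _mm_mask_blend_epi16(0xF0, a, w);
}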
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
-               (__v16qi) __W,
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_abs_epi8(__A),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_abs_epi8(__A),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
-               (__v32qi) __W,
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_abs_epi8(__A),
+                                             (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_abs_epi8(__A),
+                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
-               (__v8hi) __W,
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_abs_epi16(__A),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_abs_epi16(__A),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
-               (__v16hi) __W,
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_abs_epi16(__A),
+                                             (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_abs_epi16(__A),
+                                             (__v16hi)_mm256_setzero_si256());
 }
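/*
 * The same two-step recipe covers unary operations: compute the full-width
 * result, then select per lane.  In terms of public intrinsics the select
 * step is _mm_mask_mov_epi16, so a masked abs can be sketched without any
 * __builtin (my_mask_abs_epi16 is a hypothetical stand-in, not the header's
 * definition):
 */
#include <immintrin.h>

__m128i my_mask_abs_epi16(__m128i w, __mmask8 u, __m128i a) {
  /* Lanes with a set bit in u take |a|; the rest keep w. */
  return _mm_mask_mov_epi16(w, u, _mm_abs_epi16(a));
}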
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packs_epi32(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
-               (__v4si) __B,
-               (__v8hi) _mm_setzero_si128 (), __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packs_epi32(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packs_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
-          __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
-               (__v4si) __B,
-               (__v8hi) __W, __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                          (__v16hi)_mm256_packs_epi32(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
-               (__v8si) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                          (__v16hi)_mm256_packs_epi32(__A, __B),
+                                          (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
-       __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
-               (__v8si) __B,
-               (__v16hi) __W, __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_packs_epi16(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_packs_epi16(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
-          __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v16qi) __W,
-               __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                          (__v32qi)_mm256_packs_epi16(__A, __B),
+                                          (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                          (__v32qi)_mm256_packs_epi16(__A, __B),
+                                          (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
-       __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v32qi) __W,
-               __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packus_epi32(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
-               (__v4si) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packus_epi32(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packus_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
-           __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
-               (__v4si) __B,
-               (__v8hi) __W, __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                         (__v16hi)_mm256_packus_epi32(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
-               (__v8si) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                         (__v16hi)_mm256_packus_epi32(__A, __B),
+                                         (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
-        __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
-               (__v8si) __B,
-               (__v16hi) __W,
-               __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                            (__v16qi)_mm_packus_epi16(__A, __B),
+                                            (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                            (__v16qi)_mm_packus_epi16(__A, __B),
+                                            (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
-           __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v16qi) __W,
-               __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                         (__v32qi)_mm256_packus_epi16(__A, __B),
+                                         (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                         (__v32qi)_mm256_packus_epi16(__A, __B),
+                                         (__v32qi)__W);
 }
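/*
 * For the pack conversions the select width follows the result element size:
 * packing 32-bit lanes into 16-bit lanes masks with selectw (one mask bit per
 * 16-bit output lane), and packing 16-bit lanes into 8-bit lanes masks with
 * selectb.  A hypothetical caller:
 */
#include <immintrin.h>

__m128i demo_maskz_pack(__m128i a, __m128i b) {
  /* Eight 16-bit output lanes, so an 8-bit mask; keep only lanes 0-3. */
  return _mm_maskz_packs_epi32(0x0F, a, b);
}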
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
-        __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v32qi) __W,
-               __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epi8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epi8(__A, __B),
+                                            (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epi8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epi16(__A, __B),
+                                           (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epu8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epu8(__A, __B),
+                                            (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epu8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epu16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epu16(__A, __B),
+                                           (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epu16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_avg_epu8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
-       __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_avg_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_avg_epu8(__A, __B),
+                                             (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
-          __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_avg_epu8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_avg_epu16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_avg_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                            (__v16hi)_mm256_avg_epu16(__A, __B),
+                                            (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                            (__v16hi)_mm256_avg_epu16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
 }
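/*
 * Independent of the builtin rewrite, every definition in this hunk also
 * moves from the single __DEFAULT_FN_ATTRS macro to width-specific
 * __DEFAULT_FN_ATTRS128/__DEFAULT_FN_ATTRS256 variants.  The likely shape of
 * those macros (defined near the top of the header, outside this hunk; the
 * exact spelling is an assumption here) adds a minimum-vector-width hint to
 * each function:
 */
#define DEMO_FN_ATTRS128                                                       \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512bw"), __min_vector_width__(128)))
#define DEMO_FN_ATTRS256                                                       \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512bw"), __min_vector_width__(256)))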
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epi8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
-       __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epi8(__A, __B),
+                                             (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
-          __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epi16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epi16(__A, __B),
+                                            (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
-           __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epu8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
-       __m128i __B)
-{
-  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epu8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
-          __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epu8(__A, __B),
+                                             (__v32qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epu16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epu16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epu16(__A, __B),
+                                            (__v16hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
-       __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epi8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
-          __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epi8(__A, __B),
+                                             (__v32qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epi16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epi16(__A, __B),
+                                            (__v16hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
-       __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epu8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epu8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
-          __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epu8(__A, __B),
+                                             (__v32qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epu16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epu16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epu16(__A, __B),
+                                            (__v16hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
-           __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                            (__v16qi)_mm_shuffle_epi8(__A, __B),
+                                            (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                            (__v16qi)_mm_shuffle_epi8(__A, __B),
+                                            (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
-        __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                         (__v32qi)_mm256_shuffle_epi8(__A, __B),
+                                         (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                         (__v32qi)_mm256_shuffle_epi8(__A, __B),
+                                         (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epi8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epi8(__A, __B),
+                                            (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epi8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
 }
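
The subs_/adds_ family performs saturating arithmetic; the masked wrappers above merely layer merge or zero masking over it. One lane of _mm_subs_epi8, modeled in scalar form as a sketch of the semantics:

#include <stdint.h>

/* Signed saturating subtract of one byte lane: the 9-bit difference is
 * clamped to [INT8_MIN, INT8_MAX] instead of wrapping. */
static int8_t subs_epi8_lane(int8_t a, int8_t b) {
  int d = (int)a - (int)b;
  if (d > INT8_MAX) return INT8_MAX;
  if (d < INT8_MIN) return INT8_MIN;
  return (int8_t)d;
}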
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epi16(__A, __B),
+                                           (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) __W,
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epu8(__A, __B),
+                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
-               (__v16qi) __B,
-               (__v16qi) _mm_setzero_si128 (),
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) __W,
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epu8(__A, __B),
+                                            (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
-               (__v32qi) __B,
-               (__v32qi) _mm256_setzero_si256 (),
-               (__mmask32) __U);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epu8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epu16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) _mm_setzero_si128 (),
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epu16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epu16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256 (),
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
+                                                 (__v8hi) __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U,
-            __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I,
+                            __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
-               (__v8hi) __I /* idx */ ,
-               (__v8hi) __B,
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128(__U,
+                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
+                                  (__v8hi)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I,
-         __mmask16 __U, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U,
+                             __m128i __B)
 {
-  return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
-               (__v16hi) __I /* idx */ ,
-               (__v16hi) __B,
-               (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectw_128(__U,
+                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
+                                  (__v8hi)__I);
 }
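
The three masked permutex2var variants differ only in the pass-through operand handed to the select: _mm_mask_permutex2var_epi16 keeps lanes of the first source __A, _mm_mask2_permutex2var_epi16 keeps lanes of the index vector __I, and the maskz form below zeroes them. The shuffle itself can be modeled per lane as follows -- a sketch of the semantics, not the builtin's implementation:

#include <stdint.h>

/* Lane i of the result is halfword (idx[i] & 0xF) of the 16-element
 * concatenation {a, b}: bit 3 of the index picks the source vector,
 * bits 2:0 pick the lane within it. */
static void permutex2var_epi16_128(const int16_t a[8], const int16_t idx[8],
                                   const int16_t b[8], int16_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    int sel = idx[i] & 0xF;
    out[i] = (sel < 8) ? a[sel] : b[sel - 8];
  }
}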
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
-               (__v8hi) __A,
-               (__v8hi) __B,
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I,
-           __m128i __B)
-{
-  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
-               (__v8hi) __A,
-               (__v8hi) __B,
-               (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
             __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I/* idx */,
-               (__v8hi) __A,
-               (__v8hi) __B,
-               (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128(__U,
+                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
+                                  (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
-               (__v16hi) __A,
-               (__v16hi) __B,
-               (__mmask16) -1);
+  return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
+                                                 (__v16hi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U,
-        __m256i __I, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I,
+                               __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
-               (__v16hi) __A,
-               (__v16hi) __B,
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256(__U,
+                              (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
+                              (__v16hi)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
-         __m256i __I, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U,
+                                __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I/* idx */,
-               (__v16hi) __A,
-               (__v16hi) __B,
-               (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256(__U,
+                              (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
+                              (__v16hi)__I);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
-               (__v16qi) __Y,
-               (__v8hi) __W,
-               (__mmask8) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
+                                 __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256(__U,
+                              (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
+                              (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
-               (__v16qi) __Y,
-              (__v8hi) _mm_setzero_si128(),
-               (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                            (__v8hi)_mm_maddubs_epi16(__X, __Y),
+                                            (__v8hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
-         __m256i __Y) {
-  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
-               (__v32qi) __Y,
-               (__v16hi) __W,
-               (__mmask16) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                            (__v8hi)_mm_maddubs_epi16(__X, __Y),
+                                            (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
-  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
-               (__v32qi) __Y,
-               (__v16hi) _mm256_setzero_si256(),
-               (__mmask16) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
+                          __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                        (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+                                        (__v16hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v4si) __W,
-               (__mmask8) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                        (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+                                        (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v4si) _mm_setzero_si128(),
-               (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_madd_epi16(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v8si) __W,
-               (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_madd_epi16(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v8si) _mm256_setzero_si256(),
-               (__mmask8) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_madd_epi16(__A, __B),
+                                            (__v8si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_madd_epi16(__A, __B),
+                                            (__v8si)_mm256_setzero_si256());
+}
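
madd widens: _mm_madd_epi16 multiplies adjacent pairs of 16-bit lanes and sums each pair into a 32-bit lane, which is why the masked wrappers above select on dwords (__builtin_ia32_selectd_128/256 over __v4si/__v8si) rather than on words. One result lane, in scalar form:

#include <stdint.h>

/* One 32-bit lane of _mm_madd_epi16: sum of two adjacent 16x16 products.
 * The sum fits in 32 bits except when all four inputs are -32768, where
 * the instruction wraps; the 64-bit intermediate keeps the C model free
 * of signed-overflow UB. */
static int32_t madd_epi16_lane(int16_t a0, int16_t b0,
                               int16_t a1, int16_t b1) {
  return (int32_t)((int64_t)a0 * b0 + (int64_t)a1 * b1);
}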
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtsepi16_epi8 (__m128i __A) {
   return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
                (__v16qi) _mm_setzero_si128(),
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
   return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
                (__v16qi) __O,
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) {
   return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
                (__v16qi) _mm_setzero_si128(),
                __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtsepi16_epi8 (__m256i __A) {
   return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
                (__v16qi) _mm_setzero_si128(),
                (__mmask16) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
   return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
                (__v16qi) __O,
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) {
   return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
                (__v16qi) _mm_setzero_si128(),
                __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtusepi16_epi8 (__m128i __A) {
   return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
                 (__v16qi) _mm_setzero_si128(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
   return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
                 (__v16qi) __O,
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) {
   return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
                 (__v16qi) _mm_setzero_si128(),
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtusepi16_epi8 (__m256i __A) {
   return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
                 (__v16qi) _mm_setzero_si128(),
                 (__mmask16) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
   return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
                 (__v16qi) __O,
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) {
   return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
                 (__v16qi) _mm_setzero_si128(),
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtepi16_epi8 (__m128i __A) {
-
-  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
-               (__v16qi) _mm_setzero_si128(),
-               (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+      __builtin_convertvector((__v8hi)__A, __v8qi),
+      (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+      12, 13, 14, 15);
 }
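
Contrast with the saturating conversions above: _mm_cvtepi16_epi8 is a plain truncation, so the rewrite expresses it with the generic __builtin_convertvector, and the shufflevector with a zero vector only widens the 64-bit result back to 128 bits. In scalar form, as a sketch of the semantics:

#include <stdint.h>

/* Modular truncation of eight 16-bit lanes to bytes (no saturation); the
 * upper eight bytes of the 128-bit result are zero. */
static void cvtepi16_epi8_128(const int16_t a[8], int8_t out[16]) {
  for (int i = 0; i < 8; ++i)
    out[i] = (int8_t)a[i];
  for (int i = 8; i < 16; ++i)
    out[i] = 0;
}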
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
   return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
                (__v16qi) __O,
                __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
   return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
                (__v16qi) _mm_setzero_si128(),
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
 }
 
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi16_epi8 (__m256i __A) {
-  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
-               (__v16qi) _mm_setzero_si128(),
-               (__mmask16) -1);
+  return (__m128i)__builtin_convertvector((__v16hi) __A, __v16qi);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
-  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
-               (__v16qi) __O,
-               __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm256_cvtepi16_epi8(__A),
+                                             (__v16qi)__O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
-  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
-               (__v16qi) _mm_setzero_si128(),
-               __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm256_cvtepi16_epi8(__A),
+                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
 {
   __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
 {
   __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
 {
   __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M);
 }
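
Besides the attribute split, this hunk fixes the mask type of _mm256_mask_cvtusepi16_storeu_epi8: the store covers 16 byte lanes, so it takes a __mmask16 rather than a __mmask8. A minimal usage sketch (the wrapper name is illustrative; build with -mavx512bw -mavx512vl):

#include <immintrin.h>
#include <stdint.h>

/* Store the unsigned-saturating byte narrowing of 16 halfword lanes, but
 * only for lanes whose mask bit is set; other destination bytes are left
 * untouched. */
void narrow_store(uint8_t *dst, __m256i v, __mmask16 keep) {
  _mm256_mask_cvtusepi16_storeu_epi8(dst, keep, v);
}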
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
-               (__v8hi) __Y,
-               (__v8hi) __W,
-               (__mmask8) __U);
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
-               (__v8hi) __Y,
-              (__v8hi) _mm_setzero_si128(),
-               (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
-  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
-               (__v16hi) __Y,
-               (__v16hi) __W,
-               (__mmask16) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+                                         (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
-  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
-               (__v16hi) __Y,
-               (__v16hi) _mm256_setzero_si256(),
-               (__mmask16) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+                                         (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
-          __m128i __B) {
-  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epu16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-              (__v8hi) _mm_setzero_si128(),
-               (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
-       __m256i __B) {
-  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epu16(__A, __B),
+                                          (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256(),
-               (__mmask16) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epu16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-          __m128i __B) {
-  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-               (__v8hi) __W,
-               (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
-               (__v8hi) __B,
-              (__v8hi) _mm_setzero_si128(),
-               (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-       __m256i __B) {
-  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) __W,
-               (__mmask16) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epi16(__A, __B),
+                                          (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
-               (__v16hi) __B,
-               (__v16hi) _mm256_setzero_si256(),
-               (__mmask16) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                            (__v16qi)_mm_unpackhi_epi8(__A, __B),
                                            (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                            (__v16qi)_mm_unpackhi_epi8(__A, __B),
                                            (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                         (__v32qi)_mm256_unpackhi_epi8(__A, __B),
                                         (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                         (__v32qi)_mm256_unpackhi_epi8(__A, __B),
                                         (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                            (__v8hi)_mm_unpackhi_epi16(__A, __B),
                                            (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                            (__v8hi)_mm_unpackhi_epi16(__A, __B),
                                            (__v8hi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                        (__v16hi)_mm256_unpackhi_epi16(__A, __B),
                                        (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                        (__v16hi)_mm256_unpackhi_epi16(__A, __B),
                                        (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                            (__v16qi)_mm_unpacklo_epi8(__A, __B),
                                            (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                            (__v16qi)_mm_unpacklo_epi8(__A, __B),
                                            (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                         (__v32qi)_mm256_unpacklo_epi8(__A, __B),
                                         (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                         (__v32qi)_mm256_unpacklo_epi8(__A, __B),
                                         (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                            (__v8hi)_mm_unpacklo_epi16(__A, __B),
                                            (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                            (__v8hi)_mm_unpacklo_epi16(__A, __B),
                                            (__v8hi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                        (__v16hi)_mm256_unpacklo_epi16(__A, __B),
                                        (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                        (__v16hi)_mm256_unpacklo_epi16(__A, __B),
                                        (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask32 __U, __m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
-                (__v8hi) __W,
-                (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepi8_epi16(__A),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
-                (__v8hi)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepi8_epi16(__A),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask32 __U, __m128i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
-                (__v16hi) __W,
-                (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
+                                             (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
-                (__v16hi)
-                _mm256_setzero_si256 (),
-                (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
+                                             (__v16hi)_mm256_setzero_si256());
 }
 
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask32 __U, __m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
-                (__v8hi) __W,
-                (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepu8_epi16(__A),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
-                (__v8hi)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepu8_epi16(__A),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask32 __U, __m128i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
-                (__v16hi) __W,
-                (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
+                                             (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
-                (__v16hi)
-                _mm256_setzero_si256 (),
-                (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
+                                             (__v16hi)_mm256_setzero_si256());
 }
 
 
-#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
-                                         (__v16qi)(__m128i)(b), (int)(p), \
-                                         (__mmask16)-1); })
-
-#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
-                                         (__v16qi)(__m128i)(b), (int)(p), \
-                                         (__mmask16)(m)); })
-
-#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
-                                          (__v16qi)(__m128i)(b), (int)(p), \
-                                          (__mmask16)-1); })
-
-#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
-                                          (__v16qi)(__m128i)(b), (int)(p), \
-                                          (__mmask16)(m)); })
-
-#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \
-  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
-                                         (__v32qi)(__m256i)(b), (int)(p), \
-                                         (__mmask32)-1); })
-
-#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
-  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
-                                         (__v32qi)(__m256i)(b), (int)(p), \
-                                         (__mmask32)(m)); })
-
-#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \
-  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
-                                          (__v32qi)(__m256i)(b), (int)(p), \
-                                          (__mmask32)-1); })
-
-#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
-  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
-                                          (__v32qi)(__m256i)(b), (int)(p), \
-                                          (__mmask32)(m)); })
-
-#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \
-  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
-                                        (__v8hi)(__m128i)(b), (int)(p), \
-                                        (__mmask8)-1); })
-
-#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
-  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
-                                        (__v8hi)(__m128i)(b), (int)(p), \
-                                        (__mmask8)(m)); })
-
-#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \
-  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
-                                         (__v8hi)(__m128i)(b), (int)(p), \
-                                         (__mmask8)-1); })
-
-#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
-  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
-                                         (__v8hi)(__m128i)(b), (int)(p), \
-                                         (__mmask8)(m)); })
-
-#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
-                                         (__v16hi)(__m256i)(b), (int)(p), \
-                                         (__mmask16)-1); })
-
-#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
-                                         (__v16hi)(__m256i)(b), (int)(p), \
-                                         (__mmask16)(m)); })
-
-#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
-                                          (__v16hi)(__m256i)(b), (int)(p), \
-                                          (__mmask16)-1); })
-
-#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
-  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
-                                          (__v16hi)(__m256i)(b), (int)(p), \
-                                          (__mmask16)(m)); })
-
-#define _mm_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+#define _mm_mask_shufflehi_epi16(W, U, A, imm) \
   (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
                                       (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
-                                      (__v8hi)(__m128i)(W)); })
+                                      (__v8hi)(__m128i)(W))
 
-#define _mm_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+#define _mm_maskz_shufflehi_epi16(U, A, imm) \
   (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
                                       (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
-                                      (__v8hi)_mm_setzero_hi()); })
+                                      (__v8hi)_mm_setzero_si128())
 
-#define _mm256_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \
   (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
                                       (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
-                                      (__v16hi)(__m256i)(W)); })
+                                      (__v16hi)(__m256i)(W))
 
-#define _mm256_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+#define _mm256_maskz_shufflehi_epi16(U, A, imm) \
   (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
                                       (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
-                                      (__v16hi)_mm256_setzero_si256()); })
+                                      (__v16hi)_mm256_setzero_si256())
 
-#define _mm_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+#define _mm_mask_shufflelo_epi16(W, U, A, imm) \
   (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
                                       (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
-                                      (__v8hi)(__m128i)(W)); })
+                                      (__v8hi)(__m128i)(W))
 
-#define _mm_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+#define _mm_maskz_shufflelo_epi16(U, A, imm) \
   (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
                                       (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
-                                      (__v8hi)_mm_setzero_hi()); })
+                                      (__v8hi)_mm_setzero_si128())
 
-#define _mm256_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \
   (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
                                       (__v16hi)_mm256_shufflelo_epi16((A), \
                                                                       (imm)), \
-                                      (__v16hi)(__m256i)(W)); })
+                                      (__v16hi)(__m256i)(W))
 
-#define _mm256_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+#define _mm256_maskz_shufflelo_epi16(U, A, imm) \
   (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
                                       (__v16hi)_mm256_shufflelo_epi16((A), \
                                                                       (imm)), \
-                                      (__v16hi)_mm256_setzero_si256()); })
+                                      (__v16hi)_mm256_setzero_si256())
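
Note: the rewritten macros above drop the GNU statement-expression wrapper (__extension__ ({ ... })) in favor of a single cast expression. A minimal standalone sketch of the difference, with illustrative names that are not from the header; the expression form is also valid where a constant expression is required, which the statement-expression form is not:

#include <stdio.h>

/* Old pattern: GNU statement expression (GCC/Clang extension). */
#define ABS_STMT_EXPR(x) __extension__ ({ int _t = (x); _t < 0 ? -_t : _t; })
/* New pattern: a single expression, usable in more contexts. */
#define ABS_EXPR(x) ((x) < 0 ? -(x) : (x))

/* ABS_EXPR works in a file-scope constant initializer;
 * ABS_STMT_EXPR would be rejected here. */
static const int k = ABS_EXPR(-5);

int main(void) {
  printf("%d %d %d\n", ABS_STMT_EXPR(-3), ABS_EXPR(-3), k); /* 3 3 5 */
  return 0;
}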
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_sllv_epi16 (__m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sllv_epi16(__m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi)
-              _mm256_setzero_si256 (),
-              (__mmask16) -1);
+  return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi) __W,
-              (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_sllv_epi16(__A, __B),
+                                           (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi)
-              _mm256_setzero_si256 (),
-              (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_sllv_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sllv_epi16 (__m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_sllv_epi16(__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_hi (),
-             (__mmask8) -1);
+  return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sllv_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sllv_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
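
Aside: the pattern used throughout this hunk replaces one-shot masked builtins (e.g. __builtin_ia32_psllv16hi_mask) with the plain operation composed with a lane select. A scalar model of the select step for eight 16-bit lanes (illustrative code, not part of the header); the maskz variants simply pass a zero vector as the pass-through source:

#include <stdint.h>
#include <stdio.h>

/* Per-lane model of __builtin_ia32_selectw_128: mask bit i chooses
 * between the freshly computed lane and the pass-through source. */
static void selectw_128(int16_t out[8], uint8_t mask,
                        const int16_t computed[8], const int16_t src[8]) {
  for (int i = 0; i < 8; ++i)
    out[i] = ((mask >> i) & 1) ? computed[i] : src[i];
}

int main(void) {
  int16_t computed[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  int16_t src[8]      = {-1, -1, -1, -1, -1, -1, -1, -1};
  int16_t out[8];
  selectw_128(out, 0x0F, computed, src);   /* low four lanes pass through */
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);
  printf("\n");                            /* 10 20 30 40 -1 -1 -1 -1 */
  return 0;
}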
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sll_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sll_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-           __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
-             (__v8hi) __B,
-             (__v16hi) __W,
-             (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sll_epi16(__A, __B),
+                                          (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
-             (__v8hi) __B,
-             (__v16hi)
-             _mm256_setzero_si256 (),
-             (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sll_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
 }
 
-#define _mm_mask_slli_epi16(W, U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \
-                                         (__v8hi)(__m128i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm_maskz_slli_epi16(U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \
-                                         (__v8hi)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_mask_slli_epi16(W, U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \
-                                         (__v16hi)(__m256i)(W), \
-                                         (__mmask16)(U)); })
-
-#define _mm256_maskz_slli_epi16(U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \
-                                         (__v16hi)_mm256_setzero_si256(), \
-                                         (__mmask16)(U)); })
-
-
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_srlv_epi16 (__m256i __A, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 {
-  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi)
-              _mm256_setzero_si256 (),
-              (__mmask16) -1);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_slli_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_slli_epi16(__mmask8 __U, __m128i __A, int __B)
 {
-  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi) __W,
-              (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_slli_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
 {
-  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi)
-              _mm256_setzero_si256 (),
-              (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_slli_epi16(__A, __B),
+                                         (__v16hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srlv_epi16 (__m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_hi (),
-             (__mmask8) -1);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_slli_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srlv_epi16(__m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srlv_epi16(__A, __B),
+                                           (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_srav_epi16 (__m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi)
-              _mm256_setzero_si256 (),
-              (__mmask16) -1);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srlv_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srlv_epi16(__m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi) __W,
-              (__mmask16) __U);
+  return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
-              (__v16hi) __B,
-              (__v16hi)
-              _mm256_setzero_si256 (),
-              (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srlv_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srav_epi16 (__m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_hi (),
-             (__mmask8) -1);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srlv_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srav_epi16(__m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srav_epi16(__A, __B),
+                                           (__v16hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srav_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srav_epi16(__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-           __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
-             (__v8hi) __B,
-             (__v16hi) __W,
-             (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srav_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
-             (__v8hi) __B,
-             (__v16hi)
-             _mm256_setzero_si256 (),
-             (__mmask16) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srav_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-#define _mm_mask_srai_epi16(W, U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
-                                         (__v8hi)(__m128i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm_maskz_srai_epi16(U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
-                                         (__v8hi)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_mask_srai_epi16(W, U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
-                                         (__v16hi)(__m256i)(W), \
-                                         (__mmask16)(U)); })
-
-#define _mm256_maskz_srai_epi16(U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
-                                         (__v16hi)_mm256_setzero_si256(), \
-                                         (__mmask16)(U)); })
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sra_epi16(__A, __B),
+                                             (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sra_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sra_epi16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sra_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srai_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srai_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srai_epi16(__A, __B),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srai_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srl_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
-             (__v8hi) __B,
-             (__v8hi)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srl_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
-           __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
-             (__v8hi) __B,
-             (__v16hi) __W,
-             (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_srl_epi16(__A, __B),
+                                          (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
-             (__v8hi) __B,
-             (__v16hi)
-             _mm256_setzero_si256 (),
-             (__mmask16) __U);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_srl_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
 }
 
-#define _mm_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
-                                         (__v8hi)(__m128i)(W), \
-                                         (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srli_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
 
-#define _mm_maskz_srli_epi16(U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
-                                         (__v8hi)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srli_epi16(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srli_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
 
-#define _mm256_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
-                                         (__v16hi)(__m256i)(W), \
-                                         (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srli_epi16(__A, __B),
+                                         (__v16hi)__W);
+}
 
-#define _mm256_maskz_srli_epi16(U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
-                                         (__v16hi)_mm256_setzero_si256(), \
-                                         (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srli_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
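
Note: the immediate-count shift macros above are now real inline functions taking int __B, built on _mm_slli_epi16 and friends, which accept a non-constant count. A usage sketch, assuming it is compiled with -mavx512bw -mavx512vl and run on hardware with those features:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set1_epi16(1);
  int n = 3;                                    /* runtime count is fine */
  __m128i r = _mm_maskz_slli_epi16(0x0F, a, n); /* low 4 lanes: 1 << 3 */
  short out[8];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d\n", out[0], out[4]);            /* 8 0 */
  return 0;
}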
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
@@ -2830,15 +2208,15 @@
                 (__v8hi) __W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
                 (__v8hi) __A,
-                (__v8hi) _mm_setzero_hi ());
+                (__v8hi) _mm_setzero_si128 ());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
@@ -2846,7 +2224,7 @@
                 (__v16hi) __W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
@@ -2854,7 +2232,7 @@
                 (__v16hi) _mm256_setzero_si256 ());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
@@ -2862,15 +2240,15 @@
                 (__v16qi) __W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
                 (__v16qi) __A,
-                (__v16qi) _mm_setzero_hi ());
+                (__v16qi) _mm_setzero_si128 ());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
@@ -2878,7 +2256,7 @@
                 (__v32qi) __W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
@@ -2887,41 +2265,39 @@
 }
 
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
 {
-  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
-                 (__v16qi) __O,
-                 __M);
+  return (__m128i) __builtin_ia32_selectb_128(__M,
+                                              (__v16qi) _mm_set1_epi8(__A),
+                                              (__v16qi) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_set1_epi8 (__mmask16 __M, char __A)
 {
-  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
-                 (__v16qi)
-                 _mm_setzero_si128 (),
-                 __M);
+  return (__m128i) __builtin_ia32_selectb_128(__M,
+                                              (__v16qi) _mm_set1_epi8(__A),
+                                              (__v16qi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
 {
-  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
-                 (__v32qi) __O,
-                 __M);
+  return (__m256i) __builtin_ia32_selectb_256(__M,
+                                              (__v32qi) _mm256_set1_epi8(__A),
+                                              (__v32qi) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
 {
-  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
-                 (__v32qi)
-                 _mm256_setzero_si256 (),
-                 __M);
+  return (__m256i) __builtin_ia32_selectb_256(__M,
+                                              (__v32qi) _mm256_set1_epi8(__A),
+                                              (__v32qi) _mm256_setzero_si256());
 }
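
Aside: with the pbroadcast*_gpr_mask builtins removed, a masked broadcast is now an ordinary set1 followed by a lane select. A usage sketch under the same AVX512BW/VL assumptions as above:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i passthru = _mm_setzero_si128();
  /* Broadcast 7 into lanes 0 and 1 only; other lanes keep passthru. */
  __m128i r = _mm_mask_set1_epi8(passthru, 0x0003, 7);
  char out[16];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d %d\n", out[0], out[1], out[2]); /* 7 7 0 */
  return 0;
}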
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
@@ -2929,16 +2305,16 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
                  (__v8hi)
-                 _mm_setzero_hi (),
+                 _mm_setzero_si128 (),
                  (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
@@ -2946,7 +2322,7 @@
                  (__mmask16) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
@@ -2955,7 +2331,7 @@
                  (__mmask16) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
@@ -2963,7 +2339,7 @@
                  (__mmask16) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
@@ -2972,7 +2348,7 @@
                  (__mmask16) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
@@ -2980,7 +2356,7 @@
                  (__mmask32) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
@@ -2988,7 +2364,7 @@
                  _mm256_setzero_si256 (),
                  (__mmask32) __U);
 }
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
 {
   __builtin_ia32_storedquhi128_mask ((__v8hi *) __P,
@@ -2996,7 +2372,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
 {
   __builtin_ia32_storedquhi256_mask ((__v16hi *) __P,
@@ -3004,7 +2380,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
 {
   __builtin_ia32_storedquqi128_mask ((__v16qi *) __P,
@@ -3012,7 +2388,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
 {
   __builtin_ia32_storedquqi256_mask ((__v32qi *) __P,
@@ -3020,175 +2396,162 @@
              (__mmask32) __U);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
 _mm_test_epi8_mask (__m128i __A, __m128i __B)
 {
-  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
-            (__v16qi) __B,
-            (__mmask16) -1);
+  return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
 _mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
-            (__v16qi) __B, __U);
+  return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B),
+                                    _mm_setzero_si128());
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
 _mm256_test_epi8_mask (__m256i __A, __m256i __B)
 {
-  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
-            (__v32qi) __B,
-            (__mmask32) -1);
+  return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B),
+                                  _mm256_setzero_si256());
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
 _mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
-            (__v32qi) __B, __U);
+  return _mm256_mask_cmpneq_epi8_mask (__U, _mm256_and_si256(__A, __B),
+                                       _mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_test_epi16_mask (__m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
-                 (__v8hi) __B,
-                 (__mmask8) -1);
+  return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
-                 (__v8hi) __B, __U);
+  return _mm_mask_cmpneq_epi16_mask (__U, _mm_and_si128 (__A, __B),
+                                     _mm_setzero_si128());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
 _mm256_test_epi16_mask (__m256i __A, __m256i __B)
 {
-  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
-            (__v16hi) __B,
-            (__mmask16) -1);
+  return _mm256_cmpneq_epi16_mask (_mm256_and_si256 (__A, __B),
+                                   _mm256_setzero_si256 ());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
 _mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
-            (__v16hi) __B, __U);
+  return _mm256_mask_cmpneq_epi16_mask (__U, _mm256_and_si256(__A, __B),
+                                        _mm256_setzero_si256());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
 _mm_testn_epi8_mask (__m128i __A, __m128i __B)
 {
-  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
-             (__v16qi) __B,
-             (__mmask16) -1);
+  return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
 _mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
 {
-  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
-             (__v16qi) __B, __U);
+  return _mm_mask_cmpeq_epi8_mask (__U, _mm_and_si128 (__A, __B),
+                                  _mm_setzero_si128());
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
 _mm256_testn_epi8_mask (__m256i __A, __m256i __B)
 {
-  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
-             (__v32qi) __B,
-             (__mmask32) -1);
+  return _mm256_cmpeq_epi8_mask (_mm256_and_si256 (__A, __B),
+                                 _mm256_setzero_si256());
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
 _mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
 {
-  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
-             (__v32qi) __B, __U);
+  return _mm256_mask_cmpeq_epi8_mask (__U, _mm256_and_si256 (__A, __B),
+                                      _mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_testn_epi16_mask (__m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
-            (__v8hi) __B,
-            (__mmask8) -1);
+  return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
-            (__v8hi) __B, __U);
+  return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B),
+                                    _mm_setzero_si128());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
 _mm256_testn_epi16_mask (__m256i __A, __m256i __B)
 {
-  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
-             (__v16hi) __B,
-             (__mmask16) -1);
+  return _mm256_cmpeq_epi16_mask (_mm256_and_si256(__A, __B),
+                                  _mm256_setzero_si256());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
 _mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
-             (__v16hi) __B, __U);
+  return _mm256_mask_cmpeq_epi16_mask (__U, _mm256_and_si256 (__A, __B),
+                                       _mm256_setzero_si256());
 }
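
Aside: the ptestm/ptestnm builtins are likewise re-expressed with existing compares: testm sets mask bit i when (a AND b) has a nonzero lane i, and testn when that lane is zero. A scalar model for the eight-lane epi16 case (illustrative, not from the header):

#include <stdint.h>
#include <stdio.h>

static uint8_t test_epi16_mask(const int16_t a[8], const int16_t b[8]) {
  uint8_t m = 0;
  for (int i = 0; i < 8; ++i)
    if ((int16_t)(a[i] & b[i]) != 0)   /* testn would compare == 0 */
      m |= (uint8_t)(1u << i);
  return m;
}

int main(void) {
  int16_t a[8] = {1, 0, 4, 0, 1, 1, 0, 0};
  int16_t b[8] = {1, 1, 4, 0, 2, 1, 0, 1};
  printf("0x%02X\n", test_epi16_mask(a, b)); /* 0x25: lanes 0, 2, 5 */
  return 0;
}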
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
 _mm_movepi8_mask (__m128i __A)
 {
   return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
 _mm256_movepi8_mask (__m256i __A)
 {
   return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_movepi16_mask (__m128i __A)
 {
   return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
 _mm256_movepi16_mask (__m256i __A)
 {
   return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_movm_epi8 (__mmask16 __A)
 {
   return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_movm_epi8 (__mmask32 __A)
 {
   return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_movm_epi16 (__mmask8 __A)
 {
   return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_movm_epi16 (__mmask16 __A)
 {
   return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectb_128(__M,
@@ -3196,7 +2559,7 @@
                                              (__v16qi) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectb_128(__M,
@@ -3204,7 +2567,7 @@
                                              (__v16qi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectb_256(__M,
@@ -3212,7 +2575,7 @@
                                              (__v32qi) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectb_256(__M,
@@ -3220,7 +2583,7 @@
                                              (__v32qi) _mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectw_128(__M,
@@ -3228,7 +2591,7 @@
                                              (__v8hi) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectw_128(__M,
@@ -3236,7 +2599,7 @@
                                              (__v8hi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectw_256(__M,
@@ -3244,7 +2607,7 @@
                                              (__v16hi) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectw_256(__M,
@@ -3252,155 +2615,134 @@
                                              (__v16hi) _mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
 {
-  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
-                 (__v16hi) __O,
-                 __M);
+  return (__m256i) __builtin_ia32_selectw_256 (__M,
+                                               (__v16hi) _mm256_set1_epi16(__A),
+                                               (__v16hi) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
 {
-  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
-                 (__v16hi) _mm256_setzero_si256 (),
-                 __M);
+  return (__m256i) __builtin_ia32_selectw_256(__M,
+                                              (__v16hi)_mm256_set1_epi16(__A),
+                                              (__v16hi) _mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
 {
-  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
-                 (__v8hi) __O,
-                 __M);
+  return (__m128i) __builtin_ia32_selectw_128(__M,
+                                              (__v8hi) _mm_set1_epi16(__A),
+                                              (__v8hi) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_set1_epi16 (__mmask8 __M, short __A)
 {
-  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
-                 (__v8hi) _mm_setzero_si128 (),
-                 __M);
+  return (__m128i) __builtin_ia32_selectw_128(__M,
+                                              (__v8hi) _mm_set1_epi16(__A),
+                                              (__v8hi) _mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_permutexvar_epi16 (__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
-                 (__v8hi) __A,
-                 (__v8hi) _mm_undefined_si128 (),
-                 (__mmask8) -1);
+  return (__m128i)__builtin_ia32_permvarhi128((__v8hi) __B, (__v8hi) __A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
-                 (__v8hi) __A,
-                 (__v8hi) _mm_setzero_si128 (),
-                 (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                        (__v8hi)_mm_permutexvar_epi16(__A, __B),
+                                        (__v8hi) _mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
           __m128i __B)
 {
-  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
-                 (__v8hi) __A,
-                 (__v8hi) __W,
-                 (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                        (__v8hi)_mm_permutexvar_epi16(__A, __B),
+                                        (__v8hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
-                 (__v16hi) __A,
-                 (__v16hi) _mm256_undefined_si256 (),
-                 (__mmask16) -1);
+  return (__m256i)__builtin_ia32_permvarhi256((__v16hi) __B, (__v16hi) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
         __m256i __B)
 {
-  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
-                 (__v16hi) __A,
-                 (__v16hi) _mm256_setzero_si256 (),
-                 (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                    (__v16hi)_mm256_permutexvar_epi16(__A, __B),
+                                    (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
              __m256i __B)
 {
-  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
-                 (__v16hi) __A,
-                 (__v16hi) __W,
-                 (__mmask16) __M);
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                    (__v16hi)_mm256_permutexvar_epi16(__A, __B),
+                                    (__v16hi)__W);
 }
 
-#define _mm_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \
-                                          (__v16qi)(__m128i)(B), (int)(N), \
-                                          (__v16qi)(__m128i)(W), \
-                                          (__mmask16)(U)); })
+#define _mm_mask_alignr_epi8(W, U, A, B, N) \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+                                 (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+                                 (__v16qi)(__m128i)(W))
 
-#define _mm_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \
-                                          (__v16qi)(__m128i)(B), (int)(N), \
-                                          (__v16qi)_mm_setzero_si128(), \
-                                          (__mmask16)(U)); })
+#define _mm_maskz_alignr_epi8(U, A, B, N) \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+                                 (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+                                 (__v16qi)_mm_setzero_si128())
 
-#define _mm256_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
-  (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \
-                                          (__v32qi)(__m256i)(B), (int)(N), \
-                                          (__v32qi)(__m256i)(W), \
-                                          (__mmask32)(U)); })
+#define _mm256_mask_alignr_epi8(W, U, A, B, N) \
+  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+                              (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+                              (__v32qi)(__m256i)(W))
 
-#define _mm256_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
-  (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \
-                                          (__v32qi)(__m256i)(B), (int)(N), \
-                                          (__v32qi)_mm256_setzero_si256(), \
-                                          (__mmask32)(U)); })
+#define _mm256_maskz_alignr_epi8(U, A, B, N) \
+  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+                              (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+                              (__v32qi)_mm256_setzero_si256())
 
-#define _mm_dbsad_epu8(A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
-                                           (__v16qi)(__m128i)(B), (int)(imm), \
-                                           (__v8hi)_mm_setzero_hi(), \
-                                           (__mmask8)-1); })
+#define _mm_dbsad_epu8(A, B, imm) \
+  (__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
+                                      (__v16qi)(__m128i)(B), (int)(imm))
 
-#define _mm_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
-                                           (__v16qi)(__m128i)(B), (int)(imm), \
-                                           (__v8hi)(__m128i)(W), \
-                                           (__mmask8)(U)); })
+#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
+                                      (__v8hi)(__m128i)(W))
 
-#define _mm_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
-                                           (__v16qi)(__m128i)(B), (int)(imm), \
-                                           (__v8hi)_mm_setzero_si128(), \
-                                           (__mmask8)(U)); })
+#define _mm_maskz_dbsad_epu8(U, A, B, imm) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
+                                      (__v8hi)_mm_setzero_si128())
 
-#define _mm256_dbsad_epu8(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
-                                           (__v32qi)(__m256i)(B), (int)(imm), \
-                                           (__v16hi)_mm256_setzero_si256(), \
-                                           (__mmask16)-1); })
+#define _mm256_dbsad_epu8(A, B, imm) \
+  (__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
+                                      (__v32qi)(__m256i)(B), (int)(imm))
 
-#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
-                                           (__v32qi)(__m256i)(B), (int)(imm), \
-                                           (__v16hi)(__m256i)(W), \
-                                           (__mmask16)(U)); })
+#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                  (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
+                                  (__v16hi)(__m256i)(W))
 
-#define _mm256_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
-                                           (__v32qi)(__m256i)(B), (int)(imm), \
-                                           (__v16hi)_mm256_setzero_si256(), \
-                                           (__mmask16)(U)); })
+#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                  (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
+                                  (__v16hi)_mm256_setzero_si256())
 
-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
 
 #endif /* __AVX512VLBWINTRIN_H */
diff --git a/darwin-x86/clang-headers/avx512vlcdintrin.h b/darwin-x86/clang-headers/avx512vlcdintrin.h
index 7b02e2e..903a7c2 100644
--- a/darwin-x86/clang-headers/avx512vlcdintrin.h
+++ b/darwin-x86/clang-headers/avx512vlcdintrin.h
@@ -1,4 +1,4 @@
-/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
+/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -28,35 +28,36 @@
 #define __AVX512VLCDINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256)))
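
Note: as in avx512vlbwintrin.h above, the single attribute macro is split into 128- and 256-bit variants carrying clang's __min_vector_width__ attribute, which records the narrowest vector width a helper needs so inlining it does not force wider codegen under -mprefer-vector-width=128. A sketch of the pattern; MY_ATTRS128 is an illustrative local name, not the header's macro:

#include <immintrin.h>

/* min_vector_width(128) tells clang this function only needs 128-bit
 * vectors when it is inlined; the target attribute supplies the ISA. */
#define MY_ATTRS128 __attribute__((__always_inline__, __nodebug__, \
    __target__("avx512vl,avx512cd"), __min_vector_width__(128)))

static __inline__ __m128i MY_ATTRS128 my_add_epi32(__m128i a, __m128i b) {
  return _mm_add_epi32(a, b);
}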
 
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastmb_epi64 (__mmask8 __A)
 {
-  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
+  return (__m128i) _mm_set1_epi64x((long long) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastmb_epi64 (__mmask8 __A)
 {
-  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
+  return (__m256i) _mm256_set1_epi64x((long long)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastmw_epi32 (__mmask16 __A)
 {
-  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
+  return (__m128i) _mm_set1_epi32((int)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastmw_epi32 (__mmask16 __A)
 {
-  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
+  return (__m256i) _mm256_set1_epi32((int)__A);
 }
 
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_conflict_epi64 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
@@ -64,7 +65,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
@@ -72,16 +73,16 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
                (__v2di)
-               _mm_setzero_di (),
+               _mm_setzero_si128 (),
                (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_conflict_epi64 (__m256i __A)
 {
   return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
@@ -89,7 +90,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
@@ -97,7 +98,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
@@ -105,7 +106,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_conflict_epi32 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
@@ -113,7 +114,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
@@ -121,7 +122,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
@@ -129,7 +130,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_conflict_epi32 (__m256i __A)
 {
   return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
@@ -137,7 +138,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
@@ -145,7 +146,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
@@ -154,110 +155,95 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_lzcnt_epi32 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
-                 (__v4si) __W,
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_lzcnt_epi32(__A),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_lzcnt_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_lzcnt_epi32 (__m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
-                 (__v8si) __W,
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_lzcnt_epi32(__A),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_lzcnt_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_lzcnt_epi64 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
-                 (__v2di)
-                 _mm_setzero_di (),
-                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
-                 (__v2di) __W,
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_lzcnt_epi64(__A),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
-                 (__v2di)
-                 _mm_setzero_di (),
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_lzcnt_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_lzcnt_epi64 (__m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
-                 (__v4di)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
-                 (__v4di) __W,
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_lzcnt_epi64(__A),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
-                 (__v4di)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_lzcnt_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
 
 #endif /* __AVX512VLCDINTRIN_H */
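
In avx512vlcdintrin.h the dedicated broadcastmb/broadcastmw builtins are
replaced with the ordinary _mm*_set1_epi64x/_mm*_set1_epi32 splat paths, and
__DEFAULT_FN_ATTRS is split into 128- and 256-bit variants that carry
__min_vector_width__, telling the backend the narrowest vector width each
function needs. A usage sketch, assuming a toolchain targeting
AVX512VL+AVX512CD (compile with e.g. -mavx512vl -mavx512cd):

    #include <immintrin.h>

    /* Broadcast an 8-bit mask value into every 64-bit lane; after this
     * change the intrinsic lowers through the generic set1 path rather than
     * a dedicated builtin, so it can constant-fold like any other splat. */
    __m256i broadcast_mask_demo(__mmask8 m) {
      return _mm256_broadcastmb_epi64(m);
    }
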
diff --git a/darwin-x86/clang-headers/avx512vldqintrin.h b/darwin-x86/clang-headers/avx512vldqintrin.h
index 8187bcd..9d13846 100644
--- a/darwin-x86/clang-headers/avx512vldqintrin.h
+++ b/darwin-x86/clang-headers/avx512vldqintrin.h
@@ -29,1237 +29,1157 @@
 #define __AVX512VLDQINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(256)))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mullo_epi64 (__m256i __A, __m256i __B) {
   return (__m256i) ((__v4du) __A * (__v4du) __B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di) __W,
-              (__mmask8) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_mullo_epi64(__A, __B),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_mullo_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mullo_epi64 (__m128i __A, __m128i __B) {
   return (__m128i) ((__v2du) __A * (__v2du) __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di) __W,
-              (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_mullo_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_si128 (),
-              (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_mullo_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
-              (__v4df) __B,
-              (__v4df) __W,
-              (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_andnot_pd(__A, __B),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
-              (__v4df) __B,
-              (__v4df)
-              _mm256_setzero_pd (),
-              (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_andnot_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
-              (__v2df) __B,
-              (__v2df) __W,
-              (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_andnot_pd(__A, __B),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
-              (__v2df) __B,
-              (__v2df)
-              _mm_setzero_pd (),
-              (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_andnot_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
-             (__v8sf) __B,
-             (__v8sf) __W,
-             (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_andnot_ps(__A, __B),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
-             (__v8sf) __B,
-             (__v8sf)
-             _mm256_setzero_ps (),
-             (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_andnot_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
-             (__v4sf) __B,
-             (__v4sf) __W,
-             (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_andnot_ps(__A, __B),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
-             (__v4sf) __B,
-             (__v4sf)
-             _mm_setzero_ps (),
-             (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_andnot_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df) __W,
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_and_pd(__A, __B),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df)
-             _mm256_setzero_pd (),
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_and_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
-             (__v2df) __B,
-             (__v2df) __W,
-             (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_and_pd(__A, __B),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
-             (__v2df) __B,
-             (__v2df)
-             _mm_setzero_pd (),
-             (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_and_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf) __W,
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_and_ps(__A, __B),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf)
-            _mm256_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_and_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
-            (__v4sf) __B,
-            (__v4sf) __W,
-            (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_and_ps(__A, __B),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
-            (__v4sf) __B,
-            (__v4sf)
-            _mm_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_and_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
-        __m256d __B) {
-  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df) __W,
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_xor_pd(__A, __B),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df)
-             _mm256_setzero_pd (),
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_xor_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
-             (__v2df) __B,
-             (__v2df) __W,
-             (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_xor_pd(__A, __B),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
-             (__v2df) __B,
-             (__v2df)
-             _mm_setzero_pd (),
-             (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_xor_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf) __W,
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_xor_ps(__A, __B),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf)
-            _mm256_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_xor_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
-            (__v4sf) __B,
-            (__v4sf) __W,
-            (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_xor_ps(__A, __B),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
-            (__v4sf) __B,
-            (__v4sf)
-            _mm_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_xor_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
-            (__v4df) __B,
-            (__v4df) __W,
-            (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_or_pd(__A, __B),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
-            (__v4df) __B,
-            (__v4df)
-            _mm256_setzero_pd (),
-            (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_or_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
-            (__v2df) __B,
-            (__v2df) __W,
-            (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_or_pd(__A, __B),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
-            (__v2df) __B,
-            (__v2df)
-            _mm_setzero_pd (),
-            (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_or_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
-                 (__v8sf) __B,
-                 (__v8sf) __W,
-                 (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_or_ps(__A, __B),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
-                 (__v8sf) __B,
-                 (__v8sf)
-                 _mm256_setzero_ps (),
-                 (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_or_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
-                 (__v4sf) __B,
-                 (__v4sf) __W,
-                 (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_or_ps(__A, __B),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
-                 (__v4sf) __B,
-                 (__v4sf)
-                 _mm_setzero_ps (),
-                 (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_or_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtpd_epi64 (__m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
                 (__v2di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtpd_epi64 (__m256d __A) {
   return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
   return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
                 (__v4di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
   return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtpd_epu64 (__m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
                 (__v2di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtpd_epu64 (__m256d __A) {
   return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
   return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
                 (__v4di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
   return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtps_epi64 (__m128 __A) {
   return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
                 (__v2di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtps_epi64 (__m128 __A) {
   return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
   return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
                 (__v4di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
   return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtps_epu64 (__m128 __A) {
   return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
                 (__v2di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtps_epu64 (__m128 __A) {
   return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
   return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
                 (__v4di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
   return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_cvtepi64_pd (__m128i __A) {
-  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
-                (__v2df) _mm_setzero_pd(),
-                (__mmask8) -1);
+  return (__m128d)__builtin_convertvector((__v2di)__A, __v2df);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
-                (__v2df) __W,
-                (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_cvtepi64_pd(__A),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
-                (__v2df) _mm_setzero_pd(),
-                (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_cvtepi64_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_cvtepi64_pd (__m256i __A) {
-  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
-                (__v4df) _mm256_setzero_pd(),
-                (__mmask8) -1);
+  return (__m256d)__builtin_convertvector((__v4di)__A, __v4df);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
-  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
-                (__v4df) __W,
-                (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_cvtepi64_pd(__A),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
-  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
-                (__v4df) _mm256_setzero_pd(),
-                (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_cvtepi64_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_cvtepi64_ps (__m128i __A) {
   return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
                 (__v4sf) _mm_setzero_ps(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
   return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
                 (__v4sf) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
   return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
                 (__v4sf) _mm_setzero_ps(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_cvtepi64_ps (__m256i __A) {
   return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
                 (__v4sf) _mm_setzero_ps(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
   return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
                 (__v4sf) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
   return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
                 (__v4sf) _mm_setzero_ps(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvttpd_epi64 (__m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
                 (__v2di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvttpd_epi64 (__m256d __A) {
   return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
   return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
                 (__v4di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
   return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvttpd_epu64 (__m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
                 (__v2di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvttpd_epu64 (__m256d __A) {
   return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
   return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
                 (__v4di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
   return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvttps_epi64 (__m128 __A) {
   return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
                 (__v2di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvttps_epi64 (__m128 __A) {
   return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
   return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
                 (__v4di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
   return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvttps_epu64 (__m128 __A) {
   return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
                 (__v2di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
                 (__v2di) _mm_setzero_si128(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvttps_epu64 (__m128 __A) {
   return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
   return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
                 (__v4di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
   return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
                 (__v4di) _mm256_setzero_si256(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_cvtepu64_pd (__m128i __A) {
-  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
-                (__v2df) _mm_setzero_pd(),
-                (__mmask8) -1);
+  return (__m128d)__builtin_convertvector((__v2du)__A, __v2df);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
-                (__v2df) __W,
-                (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_cvtepu64_pd(__A),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
-                (__v2df) _mm_setzero_pd(),
-                (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_cvtepu64_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_cvtepu64_pd (__m256i __A) {
-  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
-                (__v4df) _mm256_setzero_pd(),
-                (__mmask8) -1);
+  return (__m256d)__builtin_convertvector((__v4du)__A, __v4df);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
-  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
-                (__v4df) __W,
-                (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_cvtepu64_pd(__A),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
-  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
-                (__v4df) _mm256_setzero_pd(),
-                (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_cvtepu64_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_cvtepu64_ps (__m128i __A) {
   return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
                 (__v4sf) _mm_setzero_ps(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
   return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
                 (__v4sf) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
   return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
                 (__v4sf) _mm_setzero_ps(),
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_cvtepu64_ps (__m256i __A) {
   return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
                 (__v4sf) _mm_setzero_ps(),
                 (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
   return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
                 (__v4sf) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
   return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
                 (__v4sf) _mm_setzero_ps(),
                 (__mmask8) __U);
 }
 
-#define _mm_range_pd(A, B, C) __extension__ ({                         \
+#define _mm_range_pd(A, B, C) \
   (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), (int)(C), \
                                           (__v2df)_mm_setzero_pd(), \
-                                          (__mmask8)-1); })
+                                          (__mmask8)-1)
 
-#define _mm_mask_range_pd(W, U, A, B, C) __extension__ ({          \
+#define _mm_mask_range_pd(W, U, A, B, C) \
   (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), (int)(C), \
                                           (__v2df)(__m128d)(W), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
-#define _mm_maskz_range_pd(U, A, B, C) __extension__ ({              \
+#define _mm_maskz_range_pd(U, A, B, C) \
   (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), (int)(C), \
                                           (__v2df)_mm_setzero_pd(), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
-#define _mm256_range_pd(A, B, C) __extension__ ({                      \
+#define _mm256_range_pd(A, B, C) \
   (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(C), \
                                           (__v4df)_mm256_setzero_pd(), \
-                                          (__mmask8)-1); })
+                                          (__mmask8)-1)
 
-#define _mm256_mask_range_pd(W, U, A, B, C) __extension__ ({       \
+#define _mm256_mask_range_pd(W, U, A, B, C) \
   (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(C), \
                                           (__v4df)(__m256d)(W), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
-#define _mm256_maskz_range_pd(U, A, B, C) __extension__ ({           \
+#define _mm256_maskz_range_pd(U, A, B, C) \
   (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(C), \
                                           (__v4df)_mm256_setzero_pd(), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
-#define _mm_range_ps(A, B, C) __extension__ ({                         \
+#define _mm_range_ps(A, B, C) \
   (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), (int)(C), \
                                          (__v4sf)_mm_setzero_ps(), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm_mask_range_ps(W, U, A, B, C) __extension__ ({          \
+#define _mm_mask_range_ps(W, U, A, B, C) \
   (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), (int)(C), \
-                                         (__v4sf)(__m128)(W), (__mmask8)(U)); })
+                                         (__v4sf)(__m128)(W), (__mmask8)(U))
 
-#define _mm_maskz_range_ps(U, A, B, C) __extension__ ({              \
+#define _mm_maskz_range_ps(U, A, B, C) \
   (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), (int)(C), \
                                          (__v4sf)_mm_setzero_ps(), \
-                                         (__mmask8)(U)); })
+                                         (__mmask8)(U))
 
-#define _mm256_range_ps(A, B, C) __extension__ ({                      \
+#define _mm256_range_ps(A, B, C) \
   (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(C), \
                                          (__v8sf)_mm256_setzero_ps(), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm256_mask_range_ps(W, U, A, B, C) __extension__ ({       \
+#define _mm256_mask_range_ps(W, U, A, B, C) \
   (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(C), \
-                                         (__v8sf)(__m256)(W), (__mmask8)(U)); })
+                                         (__v8sf)(__m256)(W), (__mmask8)(U))
 
-#define _mm256_maskz_range_ps(U, A, B, C) __extension__ ({           \
+#define _mm256_maskz_range_ps(U, A, B, C) \
   (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(C), \
                                          (__v8sf)_mm256_setzero_ps(), \
-                                         (__mmask8)(U)); })
+                                         (__mmask8)(U))
 
-#define _mm_reduce_pd(A, B) __extension__ ({                \
+#define _mm_reduce_pd(A, B) \
   (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)-1); })
+                                           (__mmask8)-1)
 
-#define _mm_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+#define _mm_mask_reduce_pd(W, U, A, B) \
   (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                            (__v2df)(__m128d)(W), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm_maskz_reduce_pd(U, A, B) __extension__ ({     \
+#define _mm_maskz_reduce_pd(U, A, B) \
   (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                            (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm256_reduce_pd(A, B) __extension__ ({                \
+#define _mm256_reduce_pd(A, B) \
   (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
                                            (__v4df)_mm256_setzero_pd(), \
-                                           (__mmask8)-1); })
+                                           (__mmask8)-1)
 
-#define _mm256_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+#define _mm256_mask_reduce_pd(W, U, A, B) \
   (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
                                            (__v4df)(__m256d)(W), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm256_maskz_reduce_pd(U, A, B) __extension__ ({     \
+#define _mm256_maskz_reduce_pd(U, A, B) \
   (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
                                            (__v4df)_mm256_setzero_pd(), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm_reduce_ps(A, B) __extension__ ({                   \
+#define _mm_reduce_ps(A, B) \
   (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)-1); })
+                                          (__mmask8)-1)
 
-#define _mm_mask_reduce_ps(W, U, A, B) __extension__ ({    \
+#define _mm_mask_reduce_ps(W, U, A, B) \
   (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
                                           (__v4sf)(__m128)(W), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
-#define _mm_maskz_reduce_ps(U, A, B) __extension__ ({        \
+#define _mm_maskz_reduce_ps(U, A, B) \
   (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
-#define _mm256_reduce_ps(A, B) __extension__ ({                \
+#define _mm256_reduce_ps(A, B) \
   (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
                                           (__v8sf)_mm256_setzero_ps(), \
-                                          (__mmask8)-1); })
+                                          (__mmask8)-1)
 
-#define _mm256_mask_reduce_ps(W, U, A, B) __extension__ ({ \
+#define _mm256_mask_reduce_ps(W, U, A, B) \
   (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
                                           (__v8sf)(__m256)(W), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
-#define _mm256_maskz_reduce_ps(U, A, B) __extension__ ({     \
+#define _mm256_maskz_reduce_ps(U, A, B) \
   (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
                                           (__v8sf)_mm256_setzero_ps(), \
-                                          (__mmask8)(U)); })
+                                          (__mmask8)(U))
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_movepi32_mask (__m128i __A)
 {
   return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_movepi32_mask (__m256i __A)
 {
   return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_movm_epi32 (__mmask8 __A)
 {
   return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_movm_epi32 (__mmask8 __A)
 {
   return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_movm_epi64 (__mmask8 __A)
 {
   return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_movm_epi64 (__mmask8 __A)
 {
   return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_movepi64_mask (__m128i __A)
 {
   return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_movepi64_mask (__m256i __A)
 {
   return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_broadcast_f32x2 (__m128 __A)
 {
-  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
-                (__v8sf)_mm256_undefined_ps(),
-                (__mmask8) -1);
+  return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+                                         0, 1, 0, 1, 0, 1, 0, 1);
 }
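// Broadcasts now go through the target-independent __builtin_shufflevector;
// the index list 0, 1, 0, 1, ... simply tiles the low lanes of the source
// across the wider destination. The same tiling written as plain C, as an
// illustration only (tile_pair is a hypothetical helper, not header API):
static inline void tile_pair(const float src[2], float dst[8]) {
  for (int i = 0; i < 8; ++i)
    dst[i] = src[i & 1];  // indices 0,1,0,1,0,1,0,1 as in the shuffle
}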
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
 {
-  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
-                (__v8sf) __O,
-                __M);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+                                             (__v8sf)_mm256_broadcast_f32x2(__A),
+                                             (__v8sf)__O);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
 {
-  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
-                (__v8sf) _mm256_setzero_ps (),
-                __M);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+                                             (__v8sf)_mm256_broadcast_f32x2(__A),
+                                             (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_broadcast_f64x2 (__m128d __A)
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_broadcast_f64x2(__m128d __A)
 {
-  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
-                 (__v4df)_mm256_undefined_pd(),
-                 (__mmask8) -1);
+  return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
+                                          0, 1, 0, 1);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
 {
-  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
-                 (__v4df) __O,
-                 __M);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
+                                            (__v4df)_mm256_broadcast_f64x2(__A),
+                                            (__v4df)__O);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
 {
-  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
-                 (__v4df) _mm256_setzero_ps (),
-                 __M);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
+                                            (__v4df)_mm256_broadcast_f64x2(__A),
+                                            (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcast_i32x2 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
-                 (__v4si)_mm_undefined_si128(),
-                 (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+                                          0, 1, 0, 1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
-                 (__v4si) __O,
-                 __M);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_broadcast_i32x2(__A),
+                                             (__v4si)__O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
-                 (__v4si) _mm_setzero_si128 (),
-                 __M);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_broadcast_i32x2(__A),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcast_i32x2 (__m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
-                 (__v8si)_mm256_undefined_si256(),
-                 (__mmask8) -1);
+  return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+                                          0, 1, 0, 1, 0, 1, 0, 1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
-                 (__v8si) __O,
-                 __M);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_broadcast_i32x2(__A),
+                                             (__v8si)__O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
-                 (__v8si) _mm256_setzero_si256 (),
-                 __M);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_broadcast_i32x2(__A),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_broadcast_i64x2 (__m128i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcast_i64x2(__m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
-                 (__v4di)_mm256_undefined_si256(),
-                 (__mmask8) -1);
+  return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
+                                          0, 1, 0, 1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
-                 (__v4di) __O,
-                 __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                            (__v4di)_mm256_broadcast_i64x2(__A),
+                                            (__v4di)__O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
-                 (__v4di) _mm256_setzero_si256 (),
-                 __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                            (__v4di)_mm256_broadcast_i64x2(__A),
+                                            (__v4di)_mm256_setzero_si256());
 }
 
-#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
+#define _mm256_extractf64x2_pd(A, imm) \
   (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
                                                 (int)(imm), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)-1); })
+                                                (__v2df)_mm_undefined_pd(), \
+                                                (__mmask8)-1)
 
-#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
+#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
   (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
                                                 (int)(imm), \
                                                 (__v2df)(__m128d)(W), \
-                                                (__mmask8)(U)); })
+                                                (__mmask8)(U))
 
-#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
+#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
   (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
                                                 (int)(imm), \
                                                 (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)(U)); })
+                                                (__mmask8)(U))
 
-#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \
+#define _mm256_extracti64x2_epi64(A, imm) \
   (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
                                                 (int)(imm), \
-                                                (__v2di)_mm_setzero_di(), \
-                                                (__mmask8)-1); })
+                                                (__v2di)_mm_undefined_si128(), \
+                                                (__mmask8)-1)
 
-#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
+#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
   (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
                                                 (int)(imm), \
                                                 (__v2di)(__m128i)(W), \
-                                                (__mmask8)(U)); })
+                                                (__mmask8)(U))
 
-#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
+#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
   (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
                                                 (int)(imm), \
-                                                (__v2di)_mm_setzero_di(), \
-                                                (__mmask8)(U)); })
+                                                (__v2di)_mm_setzero_si128(), \
+                                                (__mmask8)(U))
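// In the unmasked extract forms the passthru operand is now
// _mm_undefined_pd() / _mm_undefined_si128() instead of a zero vector:
// with an all-ones mask every destination element is written, so
// materializing zeros first was wasted work. The maskz forms keep an
// explicit zero vector, since that is what defines their masked-off lanes.
// Typical use is unchanged, e.g.:
//   __m128d hi = _mm256_extractf64x2_pd(v, 1);  // upper 128 bits of v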
 
-#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \
-  (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (int)(imm), \
-                                               (__v4df)_mm256_setzero_pd(), \
-                                               (__mmask8)-1); })
+#define _mm256_insertf64x2(A, B, imm) \
+  (__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(imm))
 
-#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (int)(imm), \
-                                               (__v4df)(__m256d)(W), \
-                                               (__mmask8)(U)); })
+#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                  (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+                                  (__v4df)(__m256d)(W))
 
-#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
-  (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (int)(imm), \
-                                               (__v4df)_mm256_setzero_pd(), \
-                                               (__mmask8)(U)); })
+#define _mm256_maskz_insertf64x2(U, A, B, imm) \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                  (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+                                  (__v4df)_mm256_setzero_pd())
 
-#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
-                                               (__v2di)(__m128i)(B), \
-                                               (int)(imm), \
-                                               (__v4di)_mm256_setzero_si256(), \
-                                               (__mmask8)-1); })
+#define _mm256_inserti64x2(A, B, imm) \
+  (__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
+                                          (__v2di)(__m128i)(B), (int)(imm))
 
-#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
-                                               (__v2di)(__m128i)(B), \
-                                               (int)(imm), \
-                                               (__v4di)(__m256i)(W), \
-                                               (__mmask8)(U)); })
+#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                  (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+                                  (__v4di)(__m256i)(W))
 
-#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
-                                               (__v2di)(__m128i)(B), \
-                                               (int)(imm), \
-                                               (__v4di)_mm256_setzero_si256(), \
-                                               (__mmask8)(U)); })
+#define _mm256_maskz_inserti64x2(U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                  (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+                                  (__v4di)_mm256_setzero_si256())
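// Insert follows the same split as the conversions earlier in the file: a
// single unmasked immediate builtin (__builtin_ia32_insertf64x2_256 /
// __builtin_ia32_inserti64x2_256) now serves the plain, mask and maskz
// forms, with the masked variants wrapping it in a per-lane select against
// W or a zero vector.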
 
-#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+#define _mm_mask_fpclass_pd_mask(U, A, imm) \
   (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm_fpclass_pd_mask(A, imm) __extension__ ({ \
+#define _mm_fpclass_pd_mask(A, imm) \
   (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
-                                             (__mmask8)-1); })
+                                             (__mmask8)-1)
 
-#define _mm256_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+#define _mm256_mask_fpclass_pd_mask(U, A, imm) \
   (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm256_fpclass_pd_mask(A, imm) __extension__ ({ \
+#define _mm256_fpclass_pd_mask(A, imm) \
   (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
-                                             (__mmask8)-1); })
+                                             (__mmask8)-1)
 
-#define _mm_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+#define _mm_mask_fpclass_ps_mask(U, A, imm) \
   (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm_fpclass_ps_mask(A, imm) __extension__ ({ \
+#define _mm_fpclass_ps_mask(A, imm) \
   (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                             (__mmask8)-1); })
+                                             (__mmask8)-1)
 
-#define _mm256_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+#define _mm256_mask_fpclass_ps_mask(U, A, imm) \
   (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm256_fpclass_ps_mask(A, imm) __extension__ ({ \
+#define _mm256_fpclass_ps_mask(A, imm) \
   (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
-                                             (__mmask8)-1); })
+                                             (__mmask8)-1)
 
-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
 
 #endif
diff --git a/darwin-x86/clang-headers/avx512vlintrin.h b/darwin-x86/clang-headers/avx512vlintrin.h
index 295ce29..0ee1d00 100644
--- a/darwin-x86/clang-headers/avx512vlintrin.h
+++ b/darwin-x86/clang-headers/avx512vlintrin.h
@@ -28,868 +28,440 @@
 #ifndef __AVX512VLINTRIN_H
 #define __AVX512VLINTRIN_H
 
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256)))
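// The single avx512vl attribute set is split by the vector width each
// intrinsic actually needs. __min_vector_width__(n) raises the function's
// required vector width to at least n bits, so 256-bit intrinsics keep
// full-width codegen even under options such as -mprefer-vector-width=128,
// while purely 128-bit helpers no longer impose the wider requirement.
// User code can opt in the same way (scale is a hypothetical example):
//   __attribute__((__target__("avx512vl"), __min_vector_width__(256)))
//   static __m256d scale(__m256d v) { return _mm256_add_pd(v, v); }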
 
-/* Doesn't require avx512vl, used in avx512dqintrin.h */
-static  __inline __m128i __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
-_mm_setzero_di(void) {
-  return (__m128i)(__v2di){ 0LL, 0LL};
-}
+typedef short __v2hi __attribute__((__vector_size__(4)));
+typedef char __v4qi __attribute__((__vector_size__(4)));
+typedef char __v2qi __attribute__((__vector_size__(2)));
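// __v2hi, __v4qi and __v2qi name sub-128-bit element groups, presumably for
// the narrowing truncate/store intrinsics later in this header that touch
// only the low 16 or 32 bits of a vector. Note also that the nonstandard
// _mm_setzero_di() helper above is gone; call sites in this update use the
// standard _mm_setzero_si128() / _mm_undefined_si128() instead.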
 
 /* Integer compare */
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
-                                                  (__mmask8)-1);
-}
+#define _mm_cmpeq_epi32_mask(A, B) \
+    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epi32_mask(k, A, B) \
+    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epi32_mask(A, B) \
+    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epi32_mask(k, A, B) \
+    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epi32_mask(A, B) \
+    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epi32_mask(k, A, B) \
+    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epi32_mask(A, B) \
+    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epi32_mask(k, A, B) \
+    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epi32_mask(A, B) \
+    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epi32_mask(k, A, B) \
+    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epi32_mask(A, B) \
+    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epi32_mask(k, A, B) \
+    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
-                                                  __u);
-}
+#define _mm256_cmpeq_epi32_mask(A, B) \
+    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epi32_mask(k, A, B) \
+    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epi32_mask(A, B) \
+    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epi32_mask(k, A, B) \
+    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epi32_mask(A, B) \
+    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epi32_mask(k, A, B) \
+    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epi32_mask(A, B) \
+    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epi32_mask(k, A, B) \
+    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epi32_mask(A, B) \
+    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epi32_mask(k, A, B) \
+    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epi32_mask(A, B) \
+    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epi32_mask(k, A, B) \
+    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
-                                                (__mmask8)-1);
-}
+#define _mm_cmpeq_epu32_mask(A, B) \
+    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epu32_mask(k, A, B) \
+    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epu32_mask(A, B) \
+    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epu32_mask(k, A, B) \
+    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epu32_mask(A, B) \
+    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epu32_mask(k, A, B) \
+    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epu32_mask(A, B) \
+    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epu32_mask(k, A, B) \
+    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epu32_mask(A, B) \
+    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epu32_mask(k, A, B) \
+    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epu32_mask(A, B) \
+    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epu32_mask(k, A, B) \
+    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
-                                                __u);
-}
+#define _mm256_cmpeq_epu32_mask(A, B) \
+    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epu32_mask(k, A, B) \
+    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epu32_mask(A, B) \
+    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epu32_mask(k, A, B) \
+    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epu32_mask(A, B) \
+    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epu32_mask(k, A, B) \
+    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epu32_mask(A, B) \
+    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epu32_mask(k, A, B) \
+    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epu32_mask(A, B) \
+    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epu32_mask(k, A, B) \
+    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epu32_mask(A, B) \
+    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epu32_mask(k, A, B) \
+    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
-                                                  (__mmask8)-1);
-}
+#define _mm_cmpeq_epi64_mask(A, B) \
+    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epi64_mask(k, A, B) \
+    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epi64_mask(A, B) \
+    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epi64_mask(k, A, B) \
+    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epi64_mask(A, B) \
+    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epi64_mask(k, A, B) \
+    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epi64_mask(A, B) \
+    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epi64_mask(k, A, B) \
+    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epi64_mask(A, B) \
+    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epi64_mask(k, A, B) \
+    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epi64_mask(A, B) \
+    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epi64_mask(k, A, B) \
+    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
-                                                  __u);
-}
+#define _mm256_cmpeq_epi64_mask(A, B) \
+    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epi64_mask(k, A, B) \
+    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epi64_mask(A, B) \
+    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epi64_mask(k, A, B) \
+    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epi64_mask(A, B) \
+    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epi64_mask(k, A, B) \
+    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epi64_mask(A, B) \
+    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epi64_mask(k, A, B) \
+    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epi64_mask(A, B) \
+    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epi64_mask(k, A, B) \
+    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epi64_mask(A, B) \
+    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epi64_mask(k, A, B) \
+    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
-                                                (__mmask8)-1);
-}
+#define _mm_cmpeq_epu64_mask(A, B) \
+    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epu64_mask(k, A, B) \
+    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epu64_mask(A, B) \
+    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epu64_mask(k, A, B) \
+    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epu64_mask(A, B) \
+    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epu64_mask(k, A, B) \
+    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epu64_mask(A, B) \
+    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epu64_mask(k, A, B) \
+    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epu64_mask(A, B) \
+    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epu64_mask(k, A, B) \
+    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epu64_mask(A, B) \
+    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epu64_mask(k, A, B) \
+    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
-                                                __u);
-}
+#define _mm256_cmpeq_epu64_mask(A, B) \
+    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epu64_mask(k, A, B) \
+    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epu64_mask(A, B) \
+    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epu64_mask(k, A, B) \
+    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epu64_mask(A, B) \
+    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epu64_mask(k, A, B) \
+    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epu64_mask(A, B) \
+    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epu64_mask(k, A, B) \
+    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epu64_mask(A, B) \
+    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epu64_mask(k, A, B) \
+    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epu64_mask(A, B) \
+    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epu64_mask(k, A, B) \
+    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
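// Every fixed-predicate compare (eq/ge/gt/le/lt/neq, signed and unsigned,
// 128- and 256-bit) is now a thin macro over the general
// _mm_cmp_*_mask / _mm256_cmp_*_mask intrinsics with an _MM_CMPINT_*
// predicate, replacing one dedicated builtin per predicate. Behaviour is
// unchanged; for arbitrary __m128i values a and b,
//   __mmask8 k = _mm_cmpgt_epi32_mask(a, b);
// expands to
//   _mm_cmp_epi32_mask((a), (b), _MM_CMPINT_GT);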
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
-                                                  (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
-                                                  __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
-                                                  (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
-                                                  __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
-                                                __u);
-}
-
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
-                                                  (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
-                                                  __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
-                                                  (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
-                                                  __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
-                                                  (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
-                                                  __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
-                                                  (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
-                                                  __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
-                                                __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
-                                               (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
-                                               __u);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
-                                                (__mmask8)-1);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
-  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
-                                                __u);
-}
-
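Note: the block deleted above removes the dedicated cmple/cmplt/cmpneq inline
wrappers; the hard-coded third argument to the cmp builtins is the comparison
predicate (1 = LT, 2 = LE, 4 = NE). The same comparisons remain expressible
through the generic _mm*_cmp_ep*_mask forms kept later in this file. A minimal
sketch of the equivalence, assuming an AVX512VL-enabled translation unit and
the _MM_CMPINT_* enumerators from avx512fintrin.h; the function name is
hypothetical:

    #include <immintrin.h>

    // Illustration only, not part of the header: the removed
    // _mm_cmplt_epi32_mask(a, b) is the generic compare with
    // predicate _MM_CMPINT_LT (== 1).
    static inline __mmask8 cmplt_epi32_via_generic(__m128i a, __m128i b) {
      return _mm_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
    }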
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
-             (__v8si) __B,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_add_epi32(__A, __B),
+                                             (__v8si)__W);
 }
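Note: the rewrite above is the pattern applied throughout this hunk. Instead
of one dedicated masked builtin per operation (__builtin_ia32_paddd256_mask
and friends), each masked intrinsic is now the plain unmasked intrinsic
composed with a generic per-lane select builtin. A scalar sketch of the select
semantics, assuming eight 32-bit lanes; the function name is hypothetical:

    #include <stdint.h>

    // Per-lane meaning of __builtin_ia32_selectd_256(mask, a, w):
    // lane i of the result is a[i] when mask bit i is set, else w[i].
    static void select_d_256_scalar(uint8_t mask, const int32_t a[8],
                                    const int32_t w[8], int32_t out[8]) {
      for (int i = 0; i < 8; ++i)
        out[i] = ((mask >> i) & 1) ? a[i] : w[i];
    }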
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
-             (__v8si) __B,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_add_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
-             (__v4di) __B,
-             (__v4di) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_add_epi64(__A, __B),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
-             (__v4di) __B,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_add_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
-             (__v8si) __B,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sub_epi32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
-             (__v8si) __B,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sub_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
-             (__v4di) __B,
-             (__v4di) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sub_epi64(__A, __B),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
-             (__v4di) __B,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sub_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_add_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_add_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_add_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_add_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sub_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sub_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sub_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sub_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
-           __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
-              (__v8si) __Y,
-              (__v4di) __W, __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epi32(__X, __Y),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
-              (__v8si) __Y,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epi32(__X, __Y),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
-        __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
-              (__v4si) __Y,
-              (__v2di) __W, __M);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epi32(__X, __Y),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
-              (__v4si) __Y,
-              (__v2di)
-              _mm_setzero_si128 (),
-              __M);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epi32(__X, __Y),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
-           __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
-               (__v8si) __Y,
-               (__v4di) __W, __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epu32(__X, __Y),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
-               (__v8si) __Y,
-               (__v4di)
-               _mm256_setzero_si256 (),
-               __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epu32(__X, __Y),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
-        __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
-               (__v4si) __Y,
-               (__v2di) __W, __M);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epu32(__X, __Y),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
-               (__v4si) __Y,
-               (__v2di)
-               _mm_setzero_si128 (),
-               __M);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epu32(__X, __Y),
+                                             (__v2di)_mm_setzero_si128());
 }
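Note: the mul_epi32/mul_epu32 conversions above select at 64-bit granularity
(selectq_128/selectq_256) rather than 32-bit, because the unmasked multiply
consumes only the even 32-bit lanes and produces full 64-bit products, hence
the __v2di/__v4di casts. A scalar sketch of the 128-bit signed case; the
function name is hypothetical:

    #include <stdint.h>

    // _mm_mul_epi32 semantics: each 64-bit result lane is the product of
    // the even-indexed 32-bit element from each 64-bit lane of x and y.
    static void mul_epi32_scalar(const int32_t x[4], const int32_t y[4],
                                 int64_t out[2]) {
      out[0] = (int64_t)x[0] * (int64_t)y[0];
      out[1] = (int64_t)x[2] * (int64_t)y[2];
    }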
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              __M);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_mullo_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
-       __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si) __W, __M);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_mullo_epi32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              __M);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_mullo_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mullo_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
-          __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si) __W, __M);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_mullo_epi32(__A, __B),
+                                             (__v4si)__W);
 }
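Note: besides the select rewrite, the _mm_mask_mullo_epi32 change above fixes
the mask parameter type from __mmask16 to __mmask8: a 128-bit epi32 operation
has only four lanes, so only the low four mask bits are meaningful. A hedged
usage sketch, assuming an AVX512VL-enabled translation unit; the function name
is hypothetical:

    #include <immintrin.h>

    // Lanes 0 and 2 come from a*b; lanes 1 and 3 are kept from w.
    static inline __m128i mullo_even_lanes(__m128i w, __m128i a, __m128i b) {
      return _mm_mask_mullo_epi32(w, (__mmask8)0x5, a, b);
    }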
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
@@ -897,13 +469,13 @@
                                              (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
@@ -911,13 +483,13 @@
                                              (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
@@ -925,14 +497,14 @@
                                           (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(),
                                            __U, __A, __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
@@ -940,13 +512,13 @@
                                              (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
@@ -954,13 +526,13 @@
                                              (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
@@ -968,13 +540,13 @@
                                              (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
@@ -982,13 +554,13 @@
                                              (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
@@ -997,13 +569,13 @@
                                              (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
@@ -1011,13 +583,13 @@
                                              (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
@@ -1025,13 +597,13 @@
                                              (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
@@ -1039,14 +611,14 @@
                                           (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(),
                                            __U, __A, __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
@@ -1054,13 +626,13 @@
                                              (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
@@ -1068,13 +640,13 @@
                                              (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
@@ -1082,13 +654,13 @@
                                              (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
@@ -1096,13 +668,13 @@
                                              (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
@@ -1111,921 +683,973 @@
                                              (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B);
 }
 
-#define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \
+#define _mm_cmp_epi32_mask(a, b, p) \
   (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
                                         (__v4si)(__m128i)(b), (int)(p), \
-                                        (__mmask8)-1); })
+                                        (__mmask8)-1)
 
-#define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+#define _mm_mask_cmp_epi32_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
                                         (__v4si)(__m128i)(b), (int)(p), \
-                                        (__mmask8)(m)); })
+                                        (__mmask8)(m))
 
-#define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \
+#define _mm_cmp_epu32_mask(a, b, p) \
   (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
                                          (__v4si)(__m128i)(b), (int)(p), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+#define _mm_mask_cmp_epu32_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
                                          (__v4si)(__m128i)(b), (int)(p), \
-                                         (__mmask8)(m)); })
+                                         (__mmask8)(m))
 
-#define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \
+#define _mm256_cmp_epi32_mask(a, b, p) \
   (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
                                         (__v8si)(__m256i)(b), (int)(p), \
-                                        (__mmask8)-1); })
+                                        (__mmask8)-1)
 
-#define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+#define _mm256_mask_cmp_epi32_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
                                         (__v8si)(__m256i)(b), (int)(p), \
-                                        (__mmask8)(m)); })
+                                        (__mmask8)(m))
 
-#define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \
+#define _mm256_cmp_epu32_mask(a, b, p) \
   (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
                                          (__v8si)(__m256i)(b), (int)(p), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+#define _mm256_mask_cmp_epu32_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
                                          (__v8si)(__m256i)(b), (int)(p), \
-                                         (__mmask8)(m)); })
+                                         (__mmask8)(m))
 
-#define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \
+#define _mm_cmp_epi64_mask(a, b, p) \
   (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
                                         (__v2di)(__m128i)(b), (int)(p), \
-                                        (__mmask8)-1); })
+                                        (__mmask8)-1)
 
-#define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+#define _mm_mask_cmp_epi64_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
                                         (__v2di)(__m128i)(b), (int)(p), \
-                                        (__mmask8)(m)); })
+                                        (__mmask8)(m))
 
-#define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \
+#define _mm_cmp_epu64_mask(a, b, p) \
   (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
                                          (__v2di)(__m128i)(b), (int)(p), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+#define _mm_mask_cmp_epu64_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
                                          (__v2di)(__m128i)(b), (int)(p), \
-                                         (__mmask8)(m)); })
+                                         (__mmask8)(m))
 
-#define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \
+#define _mm256_cmp_epi64_mask(a, b, p) \
   (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
                                         (__v4di)(__m256i)(b), (int)(p), \
-                                        (__mmask8)-1); })
+                                        (__mmask8)-1)
 
-#define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+#define _mm256_mask_cmp_epi64_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
                                         (__v4di)(__m256i)(b), (int)(p), \
-                                        (__mmask8)(m)); })
+                                        (__mmask8)(m))
 
-#define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \
+#define _mm256_cmp_epu64_mask(a, b, p) \
   (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
                                          (__v4di)(__m256i)(b), (int)(p), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+#define _mm256_mask_cmp_epu64_mask(m, a, b, p) \
   (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
                                          (__v4di)(__m256i)(b), (int)(p), \
-                                         (__mmask8)(m)); })
+                                         (__mmask8)(m))
 
-#define _mm256_cmp_ps_mask(a, b, p)  __extension__ ({ \
+#define _mm256_cmp_ps_mask(a, b, p)  \
   (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
                                          (__v8sf)(__m256)(b), (int)(p), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm256_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+#define _mm256_mask_cmp_ps_mask(m, a, b, p)  \
   (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
                                          (__v8sf)(__m256)(b), (int)(p), \
-                                         (__mmask8)(m)); })
+                                         (__mmask8)(m))
 
-#define _mm256_cmp_pd_mask(a, b, p)  __extension__ ({ \
+#define _mm256_cmp_pd_mask(a, b, p)  \
   (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
                                          (__v4df)(__m256d)(b), (int)(p), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm256_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+#define _mm256_mask_cmp_pd_mask(m, a, b, p)  \
   (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
                                          (__v4df)(__m256d)(b), (int)(p), \
-                                         (__mmask8)(m)); })
+                                         (__mmask8)(m))
 
-#define _mm_cmp_ps_mask(a, b, p)  __extension__ ({ \
+#define _mm_cmp_ps_mask(a, b, p)  \
   (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
                                          (__v4sf)(__m128)(b), (int)(p), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+#define _mm_mask_cmp_ps_mask(m, a, b, p)  \
   (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
                                          (__v4sf)(__m128)(b), (int)(p), \
-                                         (__mmask8)(m)); })
+                                         (__mmask8)(m))
 
-#define _mm_cmp_pd_mask(a, b, p)  __extension__ ({ \
+#define _mm_cmp_pd_mask(a, b, p)  \
   (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
                                          (__v2df)(__m128d)(b), (int)(p), \
-                                         (__mmask8)-1); })
+                                         (__mmask8)-1)
 
-#define _mm_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+#define _mm_mask_cmp_pd_mask(m, a, b, p)  \
   (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
                                          (__v2df)(__m128d)(b), (int)(p), \
-                                         (__mmask8)(m)); })
+                                         (__mmask8)(m))
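Note: the macro rewrites above drop the GNU statement-expression wrapper
(__extension__ ({ ...; })) in favor of a plain parenthesized expression. The
computed value is unchanged, but the expression form does not depend on a GNU
extension and is usable in contexts where statement expressions are rejected
(for example, constant-expression positions in C++). A hedged usage sketch,
assuming an AVX512VL-enabled translation unit; _MM_CMPINT_EQ (== 0) comes from
avx512fintrin.h and the function name is hypothetical:

    #include <immintrin.h>

    static inline __mmask8 equal_lanes(__m128i a, __m128i b) {
      return _mm_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
    }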
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
-                                                    (__v2df) __B,
-                                                    (__v2df) __C,
-                                                    (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df) __A);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df) __C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
-                                                    (__v2df) __B,
-                                                    -(__v2df) __C,
-                                                    (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df) __A);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     -(__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df) __C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
-                                                     (__v2df) __B,
-                                                     -(__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
+                                             (__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
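Note: the FMA conversions in this stretch all follow one recipe: compute the
unmasked fused result with the plain __builtin_ia32_vfmaddpd or
__builtin_ia32_vfmaddps builtin (negating __A and/or __C to derive
fmsub/fnmadd/fnmsub), then blend through a select builtin. The variants differ
only in the fallback lane source: mask_ keeps __A, mask3_ keeps __C, and
maskz_ zeroes. A scalar sketch of the blend policy for one double lane; the
function name is hypothetical:

    #include <math.h>

    // One lane of a masked FMA: f = fma(a, b, c); the mask bit picks
    // between f and the variant's fallback (a, c, or 0.0).
    static double fma_lane(int keep, double a, double b, double c,
                           double fallback) {
      double f = fma(a, b, c);
      return keep ? f : fallback;
    }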
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
-                                                    (__v4df) __B,
-                                                    (__v4df) __C,
-                                                    (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df) __A);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df) __C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
-                                                    (__v4df) __B,
-                                                    -(__v4df) __C,
-                                                    (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df) __A);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     -(__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df) __C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
-                                                     (__v4df) __B,
-                                                     -(__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
+                                                (__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
-                                                   (__v4sf) __B,
-                                                   (__v4sf) __C,
-                                                   (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf) __A);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf) __C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
-                                                   (__v4sf) __B,
-                                                   -(__v4sf) __C,
-                                                   (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf) __A);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    -(__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf) __C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    -(__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
+                                             (__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
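
The hunks above all follow one pattern: the flavor-specific masked FMA builtins (vfmaddps128_mask, _maskz, _mask3) are replaced by a single unmasked vfmaddps plus an element-wise select, and the three flavors differ only in the passthrough operand: __A for _mask, __C for _mask3, and zero for _maskz. Below is a minimal scalar sketch of that select semantics, for illustration only; select_lane is a made-up helper, not part of the header.

#include <stdio.h>

/* One lane of __builtin_ia32_selectps_128: keep the computed value where
   the mask bit is set, otherwise keep the passthrough value. */
static float select_lane(unsigned char mask, int lane, float computed,
                         float passthru) {
  return ((mask >> lane) & 1) ? computed : passthru;
}

int main(void) {
  float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4] = {9, 10, 11, 12};
  unsigned char u = 0x5;               /* lanes 0 and 2 active */
  for (int i = 0; i < 4; ++i) {
    float fmadd = a[i] * b[i] + c[i];  /* one lane of the unmasked FMA */
    printf("lane %d  mask:%g  mask3:%g  maskz:%g\n", i,
           select_lane(u, i, fmadd, a[i]),    /* _mm_mask_fmadd_ps  */
           select_lane(u, i, fmadd, c[i]),    /* _mm_mask3_fmadd_ps */
           select_lane(u, i, fmadd, 0.0f));   /* _mm_maskz_fmadd_ps */
  }
  return 0;
}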
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   (__v8sf) __C,
-                                                   (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf) __A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf) __C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   -(__v8sf) __C,
-                                                   (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf) __A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    -(__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf) __C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    -(__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
+                                                (__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
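
No separate fmsub/fnmadd/fnmsub builtins survive these hunks: each variant is spelled as vfmaddps/vfmaddps256 with negated operands, and since negating either multiplicand is equivalent, some hunks write -(__v4sf) __A where others write -(__v4sf) __B. A small scalar check of the identities being relied on, exact in IEEE arithmetic because negation only flips the sign bit:

#include <assert.h>

static double fmadd(double a, double b, double c) { return a * b + c; }

int main(void) {
  double a = 1.5, b = -2.0, c = 0.25;
  assert(fmadd(a, b, -c)  ==  a * b - c);     /* fmsub  = fmadd(a, b, -c)  */
  assert(fmadd(-a, b, c)  == -(a * b) + c);   /* fnmadd = fmadd(-a, b, c)  */
  assert(fmadd(-a, b, -c) == -(a * b) - c);   /* fnmsub = fmadd(-a, b, -c) */
  return 0;
}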
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
-                                                       (__v2df) __B,
-                                                       (__v2df) __C,
-                                                       (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __C),
+                    (__v2df) __A);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A,
-                                                        (__v2df) __B,
-                                                        (__v2df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __C),
+                    (__v2df) __C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
-                                                        (__v2df) __B,
-                                                        (__v2df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
-                                                       (__v2df) __B,
-                                                       -(__v2df) __C,
-                                                       (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                -(__v2df) __C),
+                    (__v2df) __A);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
-                                                        (__v2df) __B,
-                                                        -(__v2df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                -(__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
-                                                       (__v4df) __B,
-                                                       (__v4df) __C,
-                                                       (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __C),
+                    (__v4df) __A);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A,
-                                                        (__v4df) __B,
-                                                        (__v4df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __C),
+                    (__v4df) __C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
-                                                        (__v4df) __B,
-                                                        (__v4df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
-                                                       (__v4df) __B,
-                                                       -(__v4df) __C,
-                                                       (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   -(__v4df) __C),
+                    (__v4df) __A);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
-                                                        (__v4df) __B,
-                                                        -(__v4df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   -(__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
-                                                      (__v4sf) __B,
-                                                      (__v4sf) __C,
-                                                      (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __C),
+                    (__v4sf) __A);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A,
-                                                       (__v4sf) __B,
-                                                       (__v4sf) __C,
-                                                       (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __C),
+                    (__v4sf) __C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
-                                                       (__v4sf) __B,
-                                                       (__v4sf) __C,
-                                                       (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
-                                                      (__v4sf) __B,
-                                                      -(__v4sf) __C,
-                                                      (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                -(__v4sf) __C),
+                    (__v4sf) __A);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
-                                                       (__v4sf) __B,
-                                                       -(__v4sf) __C,
-                                                       (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                -(__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
                          __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
-                                                      (__v8sf) __B,
-                                                      (__v8sf) __C,
-                                                      (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C),
+                    (__v8sf) __A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A,
-                                                       (__v8sf) __B,
-                                                       (__v8sf) __C,
-                                                       (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C),
+                    (__v8sf) __C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
-                                                       (__v8sf) __B,
-                                                       (__v8sf) __C,
-                                                       (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
-                                                      (__v8sf) __B,
-                                                      -(__v8sf) __C,
-                                                      (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C),
+                    (__v8sf) __A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
-                                                       (__v8sf) __B,
-                                                       -(__v8sf) __C,
-                                                       (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
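
The fmaddsub/fmsubadd hunks reuse the same trick: only vfmaddsubpd/vfmaddsubps remain, and fmsubadd is obtained by negating __C, because flipping the sign of the addend swaps the per-lane add/sub pattern. A scalar sketch, assuming VFMADDSUB subtracts in even lanes and adds in odd lanes; the *_lane helpers are illustrative names:

static double fmaddsub_lane(int lane, double a, double b, double c) {
  return (lane & 1) ? a * b + c    /* odd lanes add       */
                    : a * b - c;   /* even lanes subtract */
}

static double fmsubadd_lane(int lane, double a, double b, double c) {
  return fmaddsub_lane(lane, a, b, -c);  /* negating c swaps the pattern */
}

int main(void) {
  /* lane 0: fmaddsub gives 2*3-1 = 5, fmsubadd gives 2*3+1 = 7 */
  return (fmaddsub_lane(0, 2, 3, 1) == 5 &&
          fmsubadd_lane(0, 2, 3, 1) == 7) ? 0 : 1;
}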
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df) __C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df) __C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf) __C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf) __C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A,
-                                                        (__v2df) __B,
-                                                        (__v2df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                -(__v2df) __C),
+                    (__v2df) __C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A,
-                                                        (__v4df) __B,
-                                                        (__v4df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   -(__v4df) __C),
+                    (__v4df) __C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A,
-                                                       (__v4sf) __B,
-                                                       (__v4sf) __C,
-                                                       (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                -(__v4sf) __C),
+                    (__v4sf) __C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A,
-                                                       (__v8sf) __B,
-                                                       (__v8sf) __C,
-                                                       (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C),
+                    (__v8sf) __C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             -(__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df) __A);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                -(__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df) __A);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             -(__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf) __A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                -(__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf) __A);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             -(__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df) __A);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A,
-                                                      (__v2df) __B,
-                                                      (__v2df) __C,
-                                                      (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             -(__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df) __C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                -(__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df) __A);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A,
-                                                      (__v4df) __B,
-                                                      (__v4df) __C,
-                                                      (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                -(__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df) __C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             -(__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf) __A);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A,
-                                                     (__v4sf) __B,
-                                                     (__v4sf) __C,
-                                                     (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             -(__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf) __C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                -(__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf) __A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A,
-                                                     (__v8sf) __B,
-                                                     (__v8sf) __C,
-                                                     (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                -(__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf) __C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
-             (__v2df) __B,
-             (__v2df) __W,
-             (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_add_pd(__A, __B),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
-             (__v2df) __B,
-             (__v2df)
-             _mm_setzero_pd (),
-             (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_add_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df) __W,
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_add_pd(__A, __B),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df)
-             _mm256_setzero_pd (),
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_add_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
-            (__v4sf) __B,
-            (__v4sf) __W,
-            (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_add_ps(__A, __B),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
-            (__v4sf) __B,
-            (__v4sf)
-            _mm_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_add_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf) __W,
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_add_ps(__A, __B),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf)
-            _mm256_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_add_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
 }
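
The add hunks switch from the dedicated addpd/addps mask builtins to wrapping the ordinary unmasked intrinsic (_mm_add_pd and friends) in a select; as a side effect they also correct the mask parameter of the _ps forms from __mmask16 to __mmask8, since 128- and 256-bit vectors hold at most eight floats. The same recipe can be written with public intrinsics; a sketch assuming AVX-512VL (compile with -mavx512vl), where masked_add_pd is a hypothetical name:

#include <immintrin.h>

/* select(mask, a + b, passthrough): _mm_mask_blend_pd returns its third
   argument where a mask bit is set and its second argument elsewhere. */
static __m128d masked_add_pd(__m128d w, __mmask8 u, __m128d a, __m128d b) {
  return _mm_mask_blend_pd(u, w, _mm_add_pd(a, b));
}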
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
   return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
                 (__v4si) __W,
                 (__v4si) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
   return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
                 (__v8si) __W,
                 (__v8si) __A);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
   return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
                  (__v2df) __W,
                  (__v2df) __A);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
   return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
                  (__v4df) __W,
                  (__v4df) __A);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
   return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
                 (__v4sf) __W,
                 (__v4sf) __A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
   return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
                 (__v8sf) __W,
                 (__v8sf) __A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
   return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
                 (__v2di) __W,
                 (__v2di) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
   return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
                 (__v4di) __W,
                 (__v4di) __A);
 }
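
The blend intrinsics need no computation at all: each one is the select builtin itself, taking __W where the mask bit is set and __A elsewhere, so these hunks only retag the function attributes. A usage sketch (assumes -mavx512vl; 0xAA is an arbitrary example mask):

#include <immintrin.h>

/* Take odd lanes from y and even lanes from x. */
static __m256 pick_odd_lanes(__m256 x, __m256 y) {
  return _mm256_mask_blend_ps((__mmask8)0xAA, x, y);
}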
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) {
   return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
                   (__v2df) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_compress_pd (__mmask8 __U, __m128d __A) {
   return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
                   (__v2df)
@@ -2033,14 +1657,14 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) {
   return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
                   (__v4df) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) {
   return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
                   (__v4df)
@@ -2048,14 +1672,14 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
   return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
                   (__v2di) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) {
   return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
                   (__v2di)
@@ -2063,14 +1687,14 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
   return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
                   (__v4di) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) {
   return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
                   (__v4di)
@@ -2078,14 +1702,14 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) {
   return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
                  (__v4sf) __W,
                  (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_compress_ps (__mmask8 __U, __m128 __A) {
   return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
                  (__v4sf)
@@ -2093,14 +1717,14 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) {
   return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
                  (__v8sf) __W,
                  (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) {
   return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
                  (__v8sf)
@@ -2108,14 +1732,14 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
   return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
                   (__v4si) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) {
   return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
                   (__v4si)
@@ -2123,14 +1747,14 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
   return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
                   (__v8si) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) {
   return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
                   (__v8si)
@@ -2138,130 +1762,126 @@
                   (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) {
   __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
             (__v2df) __A,
             (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) {
   __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
             (__v4df) __A,
             (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) {
   __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
             (__v2di) __A,
             (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) {
   __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
             (__v4di) __A,
             (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) {
   __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
             (__v4sf) __A,
             (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) {
   __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
             (__v8sf) __A,
             (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) {
   __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
             (__v4si) __A,
             (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
   __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
             (__v8si) __A,
             (__mmask8) __U);
 }
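
The compress and compress-store families keep their dedicated _mask builtins, and only the function attributes change: compression packs the selected elements into contiguous positions, which a per-lane select cannot express. A scalar model of the semantics, for illustration only:

/* Active lanes of src are packed to the front of dst; the remaining
   positions keep the passthrough values (zeros for the maskz forms). */
static void compress_lanes(unsigned char mask, int n, const float *src,
                           const float *passthru, float *dst) {
  int j = 0;
  for (int i = 0; i < n; ++i)
    if ((mask >> i) & 1)
      dst[j++] = src[i];
  for (; j < n; ++j)
    dst[j] = passthru[j];
}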
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
-                (__v2df) __W,
-                (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepi32_pd(__A),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepi32_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
-  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
-                (__v4df) __W,
-                (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepi32_pd(__A),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
-  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
-                (__v4df)
-                _mm256_setzero_pd (),
-                (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepi32_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
-               (__v4sf) __W,
-               (__mmask8) __U);
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_cvtepi32_ps(__A),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) {
-  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_cvtepi32_ps(__A),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
-  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
-               (__v8sf) __W,
-               (__mmask8) __U);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_cvtepi32_ps(__A),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) {
-  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
-               (__v8sf)
-               _mm256_setzero_ps (),
-               (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_cvtepi32_ps(__A),
+                                             (__v8sf)_mm256_setzero_ps());
 }
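
Conversions follow suit where an unmasked intrinsic of the right shape exists (_mm_cvtepi32_pd, _mm256_cvtpd_epi32, _mm256_cvtpd_ps): the masked form becomes convert-then-select, and the _ps mask types are again narrowed from __mmask16 to __mmask8. The 256-bit-source conversions select on their 128-bit result vector, so they pair selectd_128/selectps_128 with __DEFAULT_FN_ATTRS256. The 128-bit cvtpd2dq/cvtpd2ps forms and the unsigned cvtpd2udq family below keep their dedicated _mask builtins, presumably because those conversions zero the upper lanes of the 128-bit result, which a plain select over __W would not reproduce. A sketch of the select form, assuming -mavx512vl:

#include <immintrin.h>

/* Masked epi32 -> pd conversion as unmasked convert + blend, mirroring
   _mm_mask_cvtepi32_pd above; masked_cvtepi32_pd is a made-up name. */
static __m128d masked_cvtepi32_pd(__m128d w, __mmask8 u, __m128i a) {
  return _mm_mask_blend_pd(u, w, _mm_cvtepi32_pd(a));
}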
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
                 (__v4si) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
                 (__v4si)
@@ -2269,29 +1889,28 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
-                (__v4si) __W,
-                (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm256_cvtpd_epi32(__A),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
-                (__v4si)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm256_cvtpd_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
 }
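
The 256-bit narrowing conversions above read a __m256d source but produce a
__m128i result, so they carry the 256-bit attribute set while blending with
the 128-bit integer select. Spelled entirely with public intrinsics, the
same composition looks like this hypothetical helper (mask_cvtpd_epi32_sketch
is not a real API):

    #include <immintrin.h>

    /* Convert all four doubles unconditionally, then merge against the
       pass-through vector w wherever the mask bit is clear. */
    static __m128i mask_cvtpd_epi32_sketch(__m128i w, __mmask8 k, __m256d a) {
      return _mm_mask_mov_epi32(w, k, _mm256_cvtpd_epi32(a));
    }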
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) {
   return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
             (__v4sf) __W,
             (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
   return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
             (__v4sf)
@@ -2299,22 +1918,21 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
-  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
-               (__v4sf) __W,
-               (__mmask8) __U);
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm256_cvtpd_ps(__A),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
-  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) __U);
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm256_cvtpd_ps(__A),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtpd_epu32 (__m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
                  (__v4si)
@@ -2322,14 +1940,14 @@
                  (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
                  (__v4si) __W,
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
                  (__v4si)
@@ -2337,7 +1955,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtpd_epu32 (__m256d __A) {
   return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
                  (__v4si)
@@ -2345,14 +1963,14 @@
                  (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
   return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
                  (__v4si) __W,
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
   return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
                  (__v4si)
@@ -2360,67 +1978,63 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
-                (__v4si) __W,
-                (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtps_epi32(__A),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
-                (__v4si)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtps_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
-                (__v8si) __W,
-                (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtps_epi32(__A),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
-  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
-                (__v8si)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtps_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
-  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
-                (__v2df) __W,
-                (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_cvtps_pd(__A),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
-  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_cvtps_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
-  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
-                (__v4df) __W,
-                (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_cvtps_pd(__A),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
-  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
-                (__v4df)
-                _mm256_setzero_pd (),
-                (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_cvtps_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
 }
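
The widening float-to-double conversions follow the same pattern. A small
merge-masking sketch, again assuming -mavx512f -mavx512vl:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
      __m256d w = _mm256_set1_pd(-1.0);
      /* Merge-masking: widen lanes 1 and 3, keep w's -1.0 elsewhere. */
      __m256d r = _mm256_mask_cvtps_pd(w, 0xA, a);
      double out[4];
      _mm256_storeu_pd(out, r);
      printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* -1 2 -1 4 */
      return 0;
    }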
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtps_epu32 (__m128 __A) {
   return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
                  (__v4si)
@@ -2428,14 +2042,14 @@
                  (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
                  (__v4si) __W,
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
                  (__v4si)
@@ -2443,7 +2057,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtps_epu32 (__m256 __A) {
   return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
                  (__v8si)
@@ -2451,14 +2065,14 @@
                  (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
   return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
                  (__v8si) __W,
                  (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) {
   return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
                  (__v8si)
@@ -2466,14 +2080,14 @@
                  (__mmask8) __U);
 }
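
In contrast, the unsigned conversions (cvtps2udq and friends) stay on their
masked builtins in this change; only the attribute macro is split into the
128/256 variants. Even the nominally unmasked intrinsics such as
_mm_cvtps_epu32 go through the masked builtin, with the all-ones mask
(__mmask8)-1 standing in for "no masking". A hypothetical identity check:

    #include <immintrin.h>

    /* With all four mask bits set, the merge-masked unsigned conversion
       ignores its pass-through operand entirely, matching the unmasked
       _mm_cvtps_epu32. */
    static __m128i cvtps_epu32_all_lanes(__m128 a) {
      return _mm_mask_cvtps_epu32(_mm_setzero_si128(), 0xF, a);
    }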
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
                  (__v4si) __W,
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
                  (__v4si)
@@ -2481,22 +2095,21 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
-                 (__v4si) __W,
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm256_cvttpd_epi32(__A),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm256_cvttpd_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvttpd_epu32 (__m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
                   (__v4si)
@@ -2504,14 +2117,14 @@
                   (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
                   (__v4si) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) {
   return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
                   (__v4si)
@@ -2519,7 +2132,7 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvttpd_epu32 (__m256d __A) {
   return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
                   (__v4si)
@@ -2527,14 +2140,14 @@
                   (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
   return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
                   (__v4si) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
   return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
                   (__v4si)
@@ -2542,37 +2155,35 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
-                 (__v4si) __W,
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvttps_epi32(__A),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvttps_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
-                 (__v8si) __W,
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvttps_epi32(__A),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
-  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvttps_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
 }
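
The truncating forms compose with pre-existing unmasked intrinsics
(_mm_cvttps_epi32 is plain SSE2), so the masked variant is just that
conversion plus a select. Rounding is toward zero; a zero-masking sketch,
assuming -mavx512f -mavx512vl:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      __m128 a = _mm_setr_ps(1.9f, -2.9f, 3.5f, 4.1f);
      /* Truncation toward zero, zero-masked to the low two lanes. */
      __m128i r = _mm_maskz_cvttps_epi32(0x3, a);
      int out[4];
      _mm_storeu_si128((__m128i *)out, r);
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 1 -2 0 0 */
      return 0;
    }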
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvttps_epu32 (__m128 __A) {
   return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
                   (__v4si)
@@ -2580,14 +2191,14 @@
                   (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
                   (__v4si) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) {
   return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
                   (__v4si)
@@ -2595,7 +2206,7 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvttps_epu32 (__m256 __A) {
   return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
                   (__v8si)
@@ -2603,14 +2214,14 @@
                   (__mmask8) -1);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
   return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
                   (__v8si) __W,
                   (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
   return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
                   (__v8si)
@@ -2618,175 +2229,147 @@
                   (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_cvtepu32_pd (__m128i __A) {
-  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
-                 (__v2df)
-                 _mm_setzero_pd (),
-                 (__mmask8) -1);
+  return (__m128d) __builtin_convertvector(
+      __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df);
 }
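
The unmasked _mm_cvtepu32_pd drops its target-specific builtin entirely:
__builtin_shufflevector extracts lanes 0 and 1 of the unsigned source, and
__builtin_convertvector converts each kept lane with ordinary C cast
semantics, here unsigned int to double. A plain-C reference model
(cvtepu32_pd_model is a hypothetical name):

    /* What the shuffle+convert pair computes, lane by lane. */
    static inline void cvtepu32_pd_model(const unsigned int a[4],
                                         double out[2]) {
      out[0] = (double)a[0];   /* shufflevector keeps lanes 0 and 1 ...   */
      out[1] = (double)a[1];   /* ... convertvector widens each to double. */
    }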
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
-                 (__v2df) __W,
-                 (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepu32_pd(__A),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
-                 (__v2df)
-                 _mm_setzero_pd (),
-                 (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepu32_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_cvtepu32_pd (__m128i __A) {
-  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
-                 (__v4df)
-                 _mm256_setzero_pd (),
-                 (__mmask8) -1);
+  return (__m256d)__builtin_convertvector((__v4su)__A, __v4df);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
-  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
-                 (__v4df) __W,
-                 (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepu32_pd(__A),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
-  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
-                 (__v4df)
-                 _mm256_setzero_pd (),
-                 (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepu32_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_cvtepu32_ps (__m128i __A) {
-  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
-                (__v4sf)
-                _mm_setzero_ps (),
-                (__mmask8) -1);
+  return (__m128)__builtin_convertvector((__v4su)__A, __v4sf);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
-                (__v4sf) __W,
-                (__mmask8) __U);
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_cvtepu32_ps(__A),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
-  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
-                (__v4sf)
-                _mm_setzero_ps (),
-                (__mmask8) __U);
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_cvtepu32_ps(__A),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_cvtepu32_ps (__m256i __A) {
-  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
-                (__v8sf)
-                _mm256_setzero_ps (),
-                (__mmask8) -1);
+  return (__m256)__builtin_convertvector((__v8su)__A, __v8sf);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
-  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
-                (__v8sf) __W,
-                (__mmask8) __U);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_cvtepu32_ps(__A),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
-  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
-                (__v8sf)
-                _mm256_setzero_ps (),
-                (__mmask8) __U);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_cvtepu32_ps(__A),
+                                             (__v8sf)_mm256_setzero_ps());
 }
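
For the full-width forms no shuffle is needed, and the cast to the unsigned
vector type (__v4su, __v8su) is what makes __builtin_convertvector emit an
unsigned-to-float conversion rather than a signed one. A standalone
illustration using clang's vector extensions; u32x4 and f32x4 are
hypothetical typedefs:

    typedef unsigned int u32x4 __attribute__((vector_size(16)));
    typedef float f32x4 __attribute__((vector_size(16)));

    /* The unsigned element type drives an unsigned->float conversion per
       lane, mirroring _mm_cvtepu32_ps above. */
    static f32x4 cvt_u32x4_to_f32x4(u32x4 v) {
      return __builtin_convertvector(v, f32x4);
    }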
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_div_pd(__A, __B),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_div_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A,
-        __m256d __B) {
-  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df) __W,
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_div_pd(__A, __B),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df)
-             _mm256_setzero_pd (),
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_div_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf) __W,
-               (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_div_ps(__A, __B),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_div_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf) __W,
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_div_ps(__A, __B),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf)
-            _mm256_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_div_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
 }
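
The arithmetic families switch to the same composition: the whole-vector
divide is written first and then blended, with the expectation that the
backend folds the pair into one masked vdivpd/vdivps. A zero-masking sketch,
assuming -mavx512f -mavx512vl:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      __m256d a = _mm256_set1_pd(10.0);
      __m256d b = _mm256_setr_pd(1.0, 2.0, 4.0, 5.0);
      /* Divide only lanes 0 and 1; zero-masking clears the rest. */
      __m256d r = _mm256_maskz_div_pd(0x3, a, b);
      double out[4];
      _mm256_storeu_pd(out, r);
      printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 10 5 0 0 */
      return 0;
    }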
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) {
   return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
                 (__v2df) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_expand_pd (__mmask8 __U, __m128d __A) {
   return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
                  (__v2df)
@@ -2794,14 +2377,14 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) {
   return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
                 (__v4df) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) {
   return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
                  (__v4df)
@@ -2809,14 +2392,14 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
   return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
                 (__v2di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) {
   return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
                  (__v2di)
@@ -2824,14 +2407,14 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
   return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
                 (__v4di) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
   return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
                  (__v4di)
@@ -2839,7 +2422,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
   return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
               (__v2df) __W,
@@ -2847,7 +2430,7 @@
               __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
   return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
                (__v2df)
@@ -2856,7 +2439,7 @@
                __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
   return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
               (__v4df) __W,
@@ -2864,7 +2447,7 @@
               __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
   return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
                (__v4df)
@@ -2873,7 +2456,7 @@
                __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
   return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
               (__v2di) __W,
@@ -2881,7 +2464,7 @@
               __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
   return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
                (__v2di)
@@ -2890,7 +2473,7 @@
                __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
              void const *__P) {
   return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
@@ -2899,7 +2482,7 @@
               __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
   return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
                (__v4di)
@@ -2908,14 +2491,14 @@
                __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) {
   return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
                    (__v4sf) __W,
                    (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
   return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
               (__v4sf)
@@ -2924,14 +2507,14 @@
               __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) {
   return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
                    (__v8sf) __W,
                    (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
   return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
               (__v8sf)
@@ -2940,7 +2523,7 @@
               __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
   return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
               (__v4si) __W,
@@ -2948,7 +2531,7 @@
               __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
   return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
                (__v4si)
@@ -2956,7 +2539,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
              void const *__P) {
   return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
@@ -2965,7 +2548,7 @@
               __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
   return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
                (__v8si)
@@ -2974,14 +2557,14 @@
                __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) {
   return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
                (__v4sf) __W,
                (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_expand_ps (__mmask8 __U, __m128 __A) {
   return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
                 (__v4sf)
@@ -2989,14 +2572,14 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) {
   return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
                (__v8sf) __W,
                (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) {
   return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
                 (__v8sf)
@@ -3004,14 +2587,14 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
   return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
                 (__v4si) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) {
   return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
                  (__v4si)
@@ -3019,14 +2602,14 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
   return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
                 (__v8si) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) {
   return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
                  (__v8si)
@@ -3034,7 +2617,7 @@
                  (__mmask8) __U);
 }
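
The expand and expandload families keep their masked builtins: expansion has
no unmasked form to compose with, because the mask itself decides where the
packed source elements land. A reference model over four int lanes
(expand_epi32_model is a hypothetical name):

    /* Consecutive source elements fill the destination lanes whose mask bit
       is set; the other lanes keep the pass-through value. */
    static inline void expand_epi32_model(const int src[4], unsigned char mask,
                                          const int passthru[4], int out[4]) {
      int j = 0;
      for (int i = 0; i < 4; ++i)
        out[i] = ((mask >> i) & 1) ? src[j++] : passthru[i];
    }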
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_getexp_pd (__m128d __A) {
   return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
                 (__v2df)
@@ -3042,14 +2625,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) {
   return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
                 (__v2df) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) {
   return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
                 (__v2df)
@@ -3057,7 +2640,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_getexp_pd (__m256d __A) {
   return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
                 (__v4df)
@@ -3065,14 +2648,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) {
   return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
                 (__v4df) __W,
                 (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) {
   return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
                 (__v4df)
@@ -3080,7 +2663,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_getexp_ps (__m128 __A) {
   return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
                (__v4sf)
@@ -3088,14 +2671,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) {
   return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
                (__v4sf) __W,
                (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) {
   return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
                (__v4sf)
@@ -3103,7 +2686,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_getexp_ps (__m256 __A) {
   return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
                (__v8sf)
@@ -3111,14 +2694,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) {
   return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
                (__v8sf) __W,
                (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
   return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
                (__v8sf)
@@ -3126,708 +2709,579 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_max_pd(__A, __B),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_max_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A,
-        __m256d __B) {
-  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df) __W,
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_max_pd(__A, __B),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df)
-             _mm256_setzero_pd (),
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_max_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf) __W,
-               (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_max_ps(__A, __B),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_max_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf) __W,
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_max_ps(__A, __B),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf)
-            _mm256_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_max_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_min_pd(__A, __B),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_min_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A,
-        __m256d __B) {
-  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df) __W,
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_min_pd(__A, __B),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df)
-             _mm256_setzero_pd (),
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_min_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf) __W,
-               (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_min_ps(__A, __B),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_min_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf) __W,
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_min_ps(__A, __B),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf)
-            _mm256_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_min_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_mul_pd(__A, __B),
+                                              (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) __U);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_mul_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A,
-        __m256d __B) {
-  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df) __W,
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_mul_pd(__A, __B),
+                                              (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df)
-             _mm256_setzero_pd (),
-             (__mmask8) __U);
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_mul_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf) __W,
-               (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_mul_ps(__A, __B),
+                                             (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) __U);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_mul_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf) __W,
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_mul_ps(__A, __B),
+                                             (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf)
-            _mm256_setzero_ps (),
-            (__mmask8) __U);
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_mul_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
-             (__v4si) __W,
-             (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_abs_epi32(__A),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_abs_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
-             (__v8si) __W,
-             (__mmask8) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_abs_epi32(__A),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_abs_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_abs_epi64 (__m128i __A) {
-  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
-             (__v2di)
-             _mm_setzero_si128 (),
-             (__mmask8) -1);
+  return (__m128i)__builtin_ia32_pabsq128((__v2di)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
-             (__v2di) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_abs_epi64(__A),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
-             (__v2di)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_abs_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_abs_epi64 (__m256i __A) {
-  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) -1);
+  return (__m256i)__builtin_ia32_pabsq256((__v4di)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
-             (__v4di) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_abs_epi64(__A),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_abs_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
 }
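+
+// The unmasked forms such as _mm_abs_epi64 above now call the plain builtin
+// directly instead of the _mask builtin with an all-ones (__mmask8)-1 mask;
+// the mask_ and maskz_ forms are then layered on top of them with the select
+// builtins.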
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si) __W, __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
-           __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si) __W, __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epi32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di) __W, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_max_epi64 (__m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_si128 (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_pmaxsq128((__v2di)__A, (__v2di)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_max_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
-           __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di) __W, __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_max_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epi64 (__m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_pmaxsq256((__v4di)__A, (__v4di)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_max_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si) __W, __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_max_epi64(__A, __B),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epu32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
-           __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si) __W, __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epu32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_si128 (),
-              __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epu32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epu32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_max_epu64 (__m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_si128 (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_pmaxuq128((__v2di)__A, (__v2di)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B) {
-  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di) __W, __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_max_epu64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_max_epu64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epu64 (__m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_pmaxuq256((__v4di)__A, (__v4di)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
-           __m256i __B) {
-  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di) __W, __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_max_epu64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_max_epu64(__A, __B),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B) {
-  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si) __W, __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
-           __m256i __B) {
-  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si) __W, __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_min_epi64 (__m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_si128 (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_pminsq128((__v2di)__A, (__v2di)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B) {
-  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di) __W, __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_min_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_si128 (),
-              __M);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_min_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epi64 (__m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_pminsq256((__v4di)__A, (__v4di)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
-           __m256i __B) {
-  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di) __W, __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_min_epi64(__A, __B),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_min_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epu32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B) {
-  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si) __W, __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epu32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epu32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
-           __m256i __B) {
-  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si) __W, __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epu32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_min_epu64 (__m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_si128 (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_pminuq128((__v2di)__A, (__v2di)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
-        __m128i __B) {
-  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di) __W, __M);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_min_epu64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_si128 (),
-              __M);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_min_epu64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epu64 (__m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_pminuq256((__v4di)__A, (__v4di)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
-           __m256i __B) {
-  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di) __W, __M);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_min_epu64(__A, __B),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_min_epu64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-#define _mm_roundscale_pd(A, imm) __extension__ ({ \
+#define _mm_roundscale_pd(A, imm) \
   (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
                                               (int)(imm), \
                                               (__v2df)_mm_setzero_pd(), \
-                                              (__mmask8)-1); })
+                                              (__mmask8)-1)
 
 
-#define _mm_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \
+#define _mm_mask_roundscale_pd(W, U, A, imm) \
   (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
                                               (int)(imm), \
                                               (__v2df)(__m128d)(W), \
-                                              (__mmask8)(U)); })
+                                              (__mmask8)(U))
 
 
-#define _mm_maskz_roundscale_pd(U, A, imm) __extension__ ({ \
+#define _mm_maskz_roundscale_pd(U, A, imm) \
   (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
                                               (int)(imm), \
                                               (__v2df)_mm_setzero_pd(), \
-                                              (__mmask8)(U)); })
+                                              (__mmask8)(U))
 
 
-#define _mm256_roundscale_pd(A, imm) __extension__ ({ \
+#define _mm256_roundscale_pd(A, imm) \
   (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
                                               (int)(imm), \
                                               (__v4df)_mm256_setzero_pd(), \
-                                              (__mmask8)-1); })
+                                              (__mmask8)-1)
 
 
-#define _mm256_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \
+#define _mm256_mask_roundscale_pd(W, U, A, imm) \
   (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
                                               (int)(imm), \
                                               (__v4df)(__m256d)(W), \
-                                              (__mmask8)(U)); })
+                                              (__mmask8)(U))
 
 
-#define _mm256_maskz_roundscale_pd(U, A, imm)  __extension__ ({ \
+#define _mm256_maskz_roundscale_pd(U, A, imm)  \
   (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
                                               (int)(imm), \
                                               (__v4df)_mm256_setzero_pd(), \
-                                              (__mmask8)(U)); })
+                                              (__mmask8)(U))
 
-#define _mm_roundscale_ps(A, imm)  __extension__ ({ \
+#define _mm_roundscale_ps(A, imm)  \
   (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
                                              (__v4sf)_mm_setzero_ps(), \
-                                             (__mmask8)-1); })
+                                             (__mmask8)-1)
 
 
-#define _mm_mask_roundscale_ps(W, U, A, imm)  __extension__ ({ \
+#define _mm_mask_roundscale_ps(W, U, A, imm)  \
   (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
                                              (__v4sf)(__m128)(W), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
 
-#define _mm_maskz_roundscale_ps(U, A, imm)  __extension__ ({ \
+#define _mm_maskz_roundscale_ps(U, A, imm)  \
   (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
                                              (__v4sf)_mm_setzero_ps(), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm256_roundscale_ps(A, imm)  __extension__ ({ \
+#define _mm256_roundscale_ps(A, imm)  \
   (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
                                              (__v8sf)_mm256_setzero_ps(), \
-                                             (__mmask8)-1); })
+                                             (__mmask8)-1)
 
-#define _mm256_mask_roundscale_ps(W, U, A, imm)  __extension__ ({ \
+#define _mm256_mask_roundscale_ps(W, U, A, imm)  \
   (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
                                              (__v8sf)(__m256)(W), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
 
-#define _mm256_maskz_roundscale_ps(U, A, imm)  __extension__ ({ \
+#define _mm256_maskz_roundscale_ps(U, A, imm)  \
   (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
                                              (__v8sf)_mm256_setzero_ps(), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
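+
+// The roundscale macros above no longer wrap their expansions in the GNU
+// __extension__ ({ ... }) statement expression; each now expands to a plain
+// cast expression with no trailing semicolon, which also makes the macros
+// usable where statement expressions are not allowed (e.g. outside function
+// bodies).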
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_scalef_pd (__m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
                 (__v2df) __B,
@@ -3836,7 +3290,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
         __m128d __B) {
   return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
@@ -3845,7 +3299,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
                 (__v2df) __B,
@@ -3854,7 +3308,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_scalef_pd (__m256d __A, __m256d __B) {
   return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
                 (__v4df) __B,
@@ -3863,7 +3317,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
            __m256d __B) {
   return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
@@ -3872,7 +3326,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
                 (__v4df) __B,
@@ -3881,7 +3335,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_scalef_ps (__m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
                (__v4sf) __B,
@@ -3890,7 +3344,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
                (__v4sf) __B,
@@ -3898,7 +3352,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
                (__v4sf) __B,
@@ -3907,7 +3361,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_scalef_ps (__m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
                (__v8sf) __B,
@@ -3916,7 +3370,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
            __m256 __B) {
   return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
@@ -3925,7 +3379,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
                (__v8sf) __B,
@@ -3934,1822 +3388,1575 @@
                (__mmask8) __U);
 }
 
-#define _mm_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \
+#define _mm_i64scatter_pd(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)-1, \
                                (__v2di)(__m128i)(index), \
-                               (__v2df)(__m128d)(v1), (int)(scale)); })
+                               (__v2df)(__m128d)(v1), (int)(scale))
 
-#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)(mask), \
                                (__v2di)(__m128i)(index), \
-                               (__v2df)(__m128d)(v1), (int)(scale)); })
+                               (__v2df)(__m128d)(v1), (int)(scale))
 
-#define _mm_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+#define _mm_i64scatter_epi64(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)-1, \
                                (__v2di)(__m128i)(index), \
-                               (__v2di)(__m128i)(v1), (int)(scale)); })
+                               (__v2di)(__m128i)(v1), (int)(scale))
 
-#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)(mask), \
                                (__v2di)(__m128i)(index), \
-                               (__v2di)(__m128i)(v1), (int)(scale)); })
+                               (__v2di)(__m128i)(v1), (int)(scale))
 
-#define _mm256_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \
+#define _mm256_i64scatter_pd(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)-1, \
                                (__v4di)(__m256i)(index), \
-                               (__v4df)(__m256d)(v1), (int)(scale)); })
+                               (__v4df)(__m256d)(v1), (int)(scale))
 
-#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)(mask), \
                                (__v4di)(__m256i)(index), \
-                               (__v4df)(__m256d)(v1), (int)(scale)); })
+                               (__v4df)(__m256d)(v1), (int)(scale))
 
-#define _mm256_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+#define _mm256_i64scatter_epi64(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)-1, \
                                (__v4di)(__m256i)(index), \
-                               (__v4di)(__m256i)(v1), (int)(scale)); })
+                               (__v4di)(__m256i)(v1), (int)(scale))
 
-#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)(mask), \
                                (__v4di)(__m256i)(index), \
-                               (__v4di)(__m256i)(v1), (int)(scale)); })
+                               (__v4di)(__m256i)(v1), (int)(scale))
 
-#define _mm_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \
+#define _mm_i64scatter_ps(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)-1, \
                                (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale)); })
+                               (int)(scale))
 
-#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)(mask), \
                                (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale)); })
+                               (int)(scale))
 
-#define _mm_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+#define _mm_i64scatter_epi32(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)-1, \
                                (__v2di)(__m128i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale)); })
+                               (__v4si)(__m128i)(v1), (int)(scale))
 
-#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)(mask), \
                                (__v2di)(__m128i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale)); })
+                               (__v4si)(__m128i)(v1), (int)(scale))
 
-#define _mm256_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \
+#define _mm256_i64scatter_ps(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)-1, \
                                (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale)); })
+                               (int)(scale))
 
-#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)(mask), \
                                (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale)); })
+                               (int)(scale))
 
-#define _mm256_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+#define _mm256_i64scatter_epi32(addr, index, v1, scale) \
   __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)-1, \
                                (__v4di)(__m256i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale)); })
+                               (__v4si)(__m128i)(v1), (int)(scale))
 
-#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({  \
+#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
   __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)(mask), \
                                (__v4di)(__m256i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale)); })
+                               (__v4si)(__m128i)(v1), (int)(scale))
 
-#define _mm_i32scatter_pd(addr, index, v1, scale) __extension__ ({      \
+#define _mm_i32scatter_pd(addr, index, v1, scale) \
   __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)-1, \
                                (__v4si)(__m128i)(index), \
-                               (__v2df)(__m128d)(v1), (int)(scale)); })
+                               (__v2df)(__m128d)(v1), (int)(scale))
 
-#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({        \
-  __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)(mask), \
-                               (__v4si)(__m128i)(index), \
-                               (__v2df)(__m128d)(v1), (int)(scale)); })
+#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) \
+  __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale))
 
-#define _mm_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)-1, \
-                               (__v4si)(__m128i)(index), \
-                               (__v2di)(__m128i)(v1), (int)(scale)); })
+#define _mm_i32scatter_epi64(addr, index, v1, scale) \
+  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale))
 
-#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)(mask), \
-                               (__v4si)(__m128i)(index), \
-                               (__v2di)(__m128i)(v1), (int)(scale)); })
+#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
+  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale))
 
-#define _mm256_i32scatter_pd(addr, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)-1, \
-                               (__v4si)(__m128i)(index), \
-                               (__v4df)(__m256d)(v1), (int)(scale)); })
+#define _mm256_i32scatter_pd(addr, index, v1, scale) \
+  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale))
 
-#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)(mask), \
-                               (__v4si)(__m128i)(index), \
-                               (__v4df)(__m256d)(v1), (int)(scale)); })
+#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) \
+  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale))
 
-#define _mm256_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)-1, \
-                               (__v4si)(__m128i)(index), \
-                               (__v4di)(__m256i)(v1), (int)(scale)); })
+#define _mm256_i32scatter_epi64(addr, index, v1, scale) \
+  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale))
 
-#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)(mask), \
-                               (__v4si)(__m128i)(index), \
-                               (__v4di)(__m256i)(v1), (int)(scale)); })
+#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
+  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale))
 
-#define _mm_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)-1, \
-                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale)); })
+#define _mm_i32scatter_ps(addr, index, v1, scale) \
+  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale))
 
-#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)(mask), \
-                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale)); })
+#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) \
+  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale))
 
-#define _mm_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)-1, \
-                               (__v4si)(__m128i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale)); })
+#define _mm_i32scatter_epi32(addr, index, v1, scale) \
+  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale))
 
-#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)(mask), \
-                               (__v4si)(__m128i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale)); })
+#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
+  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale))
 
-#define _mm256_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)-1, \
-                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
-                               (int)(scale)); })
+#define _mm256_i32scatter_ps(addr, index, v1, scale) \
+  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+                               (int)(scale))
 
-#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)(mask), \
-                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
-                               (int)(scale)); })
+#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) \
+  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+                               (int)(scale))
 
-#define _mm256_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)-1, \
-                               (__v8si)(__m256i)(index), \
-                               (__v8si)(__m256i)(v1), (int)(scale)); })
+#define _mm256_i32scatter_epi32(addr, index, v1, scale) \
+  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8si)(__m256i)(v1), (int)(scale))
 
-#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
-  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)(mask), \
-                               (__v8si)(__m256i)(index), \
-                               (__v8si)(__m256i)(v1), (int)(scale)); })
+#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
+  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8si)(__m256i)(v1), (int)(scale))
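+
+// Each i32/i64 scatter macro stores element i of (v1) to the address
+// (addr) + (index)[i] * (scale) for every lane whose mask bit is set; the
+// unmasked forms pass (__mmask8)-1 so all lanes are written.  (scale) must
+// be 1, 2, 4, or 8.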
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) {
-  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
-              (__v2df) __W,
-              (__mmask8) __U);
-}
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sqrt_pd(__A),
+                                              (__v2df)__W);
+}
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) {
-  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
-              (__v2df)
-              _mm_setzero_pd (),
-              (__mmask8) __U);
-}
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sqrt_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) {
-  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
-              (__v4df) __W,
-              (__mmask8) __U);
-}
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sqrt_pd(__A),
+                                              (__v4df)__W);
+}
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) {
-  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
-              (__v4df)
-              _mm256_setzero_pd (),
-              (__mmask8) __U);
-}
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sqrt_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) {
-  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
-             (__v4sf) __W,
-             (__mmask8) __U);
-}
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sqrt_ps(__A),
+                                             (__v4sf)__W);
+}
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) {
-  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
-             (__v4sf)
-             _mm_setzero_ps (),
-             (__mmask8) __U);
-}
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sqrt_ps(__A),
+                                             (__v4sf)_mm_setzero_ps());
+}
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) {
-  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
-             (__v8sf) __W,
-             (__mmask8) __U);
-}
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sqrt_ps(__A),
+                                             (__v8sf)__W);
+}
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) {
-  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
-             (__v8sf)
-             _mm256_setzero_ps (),
-             (__mmask8) __U);
-}
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sqrt_ps(__A),
+                                             (__v8sf)_mm256_setzero_ps());
+}
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
-             (__v2df) __B,
-             (__v2df) __W,
-             (__mmask8) __U);
-}
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sub_pd(__A, __B),
+                                              (__v2df)__W);
+}
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
-             (__v2df) __B,
-             (__v2df)
-             _mm_setzero_pd (),
-             (__mmask8) __U);
-}
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sub_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A,
-        __m256d __B) {
-  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df) __W,
-             (__mmask8) __U);
-}
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sub_pd(__A, __B),
+                                              (__v4df)__W);
+}
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
-             (__v4df) __B,
-             (__v4df)
-             _mm256_setzero_pd (),
-             (__mmask8) __U);
-}
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sub_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
-            (__v4sf) __B,
-            (__v4sf) __W,
-            (__mmask8) __U);
-}
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sub_ps(__A, __B),
+                                             (__v4sf)__W);
+}
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
-            (__v4sf) __B,
-            (__v4sf)
-            _mm_setzero_ps (),
-            (__mmask8) __U);
-}
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sub_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf) __W,
-            (__mmask8) __U);
-}
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sub_ps(__A, __B),
+                                             (__v8sf)__W);
+}
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
-            (__v8sf) __B,
-            (__v8sf)
-            _mm256_setzero_ps (),
-            (__mmask8) __U);
-}
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sub_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
 
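The subtraction wrappers get the same select treatment, and the rewrite also corrects the mask parameter of the old 128- and 256-bit sub_ps prototypes from __mmask16 to __mmask8 (those vectors only have 4 or 8 float lanes). Usage sketch, again assuming an AVX512VL target:

#include <immintrin.h>

/* Subtract in lanes 0 and 2 only; lanes 1 and 3 keep the values from w. */
static __m128 partial_diff(__m128 w, __m128 a, __m128 b) {
  return _mm_mask_sub_ps(w, 0x5, a, b);
}
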
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U,
-            __m128i __B) {
-  return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A,
-                   (__v4si) __I
-                   /* idx */ ,
-                   (__v4si) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I,
+                                                (__v4si)__B);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I,
-         __mmask8 __U, __m256i __B) {
-  return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A,
-                   (__v8si) __I
-                   /* idx */ ,
-                   (__v8si) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I,
+                            __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                  (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
+                                  (__v4si)__A);
+}
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U,
-         __m128d __B) {
-  return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A,
-              (__v2di) __I
-              /* idx */ ,
-              (__v2df) __B,
-              (__mmask8)
-              __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
+                             __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                  (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
+                                  (__v4si)__I);
+}
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U,
-            __m256d __B) {
-  return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A,
-              (__v4di) __I
-              /* idx */ ,
-              (__v4df) __B,
-              (__mmask8)
-              __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,
+                             __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                  (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
+                                  (__v4si)_mm_setzero_si128());
+}
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U,
-         __m128 __B) {
-  return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A,
-                   (__v4si) __I
-                   /* idx */ ,
-                   (__v4sf) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I,
+                                                (__v8si) __B);
+}
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U,
-            __m256 __B) {
-  return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A,
-                   (__v8si) __I
-                   /* idx */ ,
-                   (__v8sf) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I,
+                               __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                               (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
+                               (__v8si)__A);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U,
-            __m128i __B) {
-  return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A,
-                   (__v2di) __I
-                   /* idx */ ,
-                   (__v2di) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U,
+                                __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                               (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
+                               (__v8si)__I);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I,
-         __mmask8 __U, __m256i __B) {
-  return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A,
-                   (__v4di) __I
-                   /* idx */ ,
-                   (__v4di) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I,
+                                __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                               (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
+                               (__v8si)_mm256_setzero_si256());
+}
 
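The permutex2var family is restructured the same way, and the hunks are regrouped: each element type now gets one unmasked inline over a vpermi2var* builtin, with the mask_, mask2_ and maskz_ forms layered on top via select. Note that the mask2_ variants blend against the index operand __I rather than __A, matching their semantics of overwriting the index register. A scalar model of the two-source shuffle itself (illustrative only, shown for the 4-lane epi32 form):

/* The low two index bits pick a lane and the next bit picks the source
 * table, so indices 0-3 read from a and 4-7 read from b. */
static int perm2_lane(const int a[4], const int b[4], int idx) {
  return (idx & 4) ? b[idx & 3] : a[idx & 3];
}
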
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) {
-  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
-                   /* idx */ ,
-                   (__v4si) __A,
-                   (__v4si) __B,
-                   (__mmask8) -1);
-}
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
+  return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I,
+                                                 (__v2df)__B);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I,
-           __m128i __B) {
-  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
-                   /* idx */ ,
-                   (__v4si) __A,
-                   (__v4si) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128(__U,
+                                     (__v2df)_mm_permutex2var_pd(__A, __I, __B),
+                                     (__v2df)__A);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I,
-            __m128i __B) {
-  return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I
-              /* idx */ ,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8)
-              __U);
-}
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128(__U,
+                                     (__v2df)_mm_permutex2var_pd(__A, __I, __B),
+                                     (__v2df)(__m128d)__I);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) {
-  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
-                   /* idx */ ,
-                   (__v8si) __A,
-                   (__v8si) __B,
-                   (__mmask8) -1);
-}
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128(__U,
+                                     (__v2df)_mm_permutex2var_pd(__A, __I, __B),
+                                     (__v2df)_mm_setzero_pd());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I,
-        __m256i __B) {
-  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
-                   /* idx */ ,
-                   (__v8si) __A,
-                   (__v8si) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
+  return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I,
+                                                 (__v4df)__B);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A,
-         __m256i __I, __m256i __B) {
-  return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I
-              /* idx */ ,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8)
-              __U);
-}
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I,
+                            __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256(__U,
+                                  (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
+                                  (__v4df)__A);
+}
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) {
-  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
-              /* idx */ ,
-              (__v2df) __A,
-              (__v2df) __B,
-              (__mmask8) -
-              1);
-}
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U,
+                             __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256(__U,
+                                  (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
+                                  (__v4df)(__m256d)__I);
+}
 
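For the floating-point mask2_ forms the index vector has integer type, so the fallback operand is bit-cast before blending, e.g. the (__v4df)(__m256d)__I above. The same reinterpretation can be written with the public cast intrinsics (illustrative sketch):

#include <immintrin.h>

/* _mm256_castsi256_pd reinterprets bits; no value conversion happens. */
static __m256d index_bits_as_pd(__m256i idx) {
  return _mm256_castsi256_pd(idx);
}
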
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I,
-        __m128d __B) {
-  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
-              /* idx */ ,
-              (__v2df) __A,
-              (__v2df) __B,
-              (__mmask8)
-              __U);
-}
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,
+                             __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256(__U,
+                                  (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
+                                  (__v4df)_mm256_setzero_pd());
+}
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I,
-         __m128d __B) {
-  return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I
-               /* idx */ ,
-               (__v2df) __A,
-               (__v2df) __B,
-               (__mmask8)
-               __U);
-}
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
+  return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I,
+                                                (__v4sf)__B);
+}
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) {
-  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
-              /* idx */ ,
-              (__v4df) __A,
-              (__v4df) __B,
-              (__mmask8) -
-              1);
-}
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128(__U,
+                                     (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
+                                     (__v4sf)__A);
+}
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I,
-           __m256d __B) {
-  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
-              /* idx */ ,
-              (__v4df) __A,
-              (__v4df) __B,
-              (__mmask8)
-              __U);
-}
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128(__U,
+                                     (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
+                                     (__v4sf)(__m128)__I);
+}
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I,
-            __m256d __B) {
-  return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I
-               /* idx */ ,
-               (__v4df) __A,
-               (__v4df) __B,
-               (__mmask8)
-               __U);
-}
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128(__U,
+                                     (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
+                                     (__v4sf)_mm_setzero_ps());
+}
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) {
-  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
-                   /* idx */ ,
-                   (__v4sf) __A,
-                   (__v4sf) __B,
-                   (__mmask8) -1);
-}
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
+  return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I,
+                                                (__v8sf) __B);
+}
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I,
-        __m128 __B) {
-  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
-                   /* idx */ ,
-                   (__v4sf) __A,
-                   (__v4sf) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256(__U,
+                                  (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
+                                  (__v8sf)__A);
+}
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I,
-         __m128 __B) {
-  return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I
-              /* idx */ ,
-              (__v4sf) __A,
-              (__v4sf) __B,
-              (__mmask8)
-              __U);
-}
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U,
+                             __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256(__U,
+                                  (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
+                                  (__v8sf)(__m256)__I);
+}
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) {
-  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
-                   /* idx */ ,
-                   (__v8sf) __A,
-                   (__v8sf) __B,
-                   (__mmask8) -1);
-}
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I,
+                             __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256(__U,
+                                  (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
+                                  (__v8sf)_mm256_setzero_ps());
+}
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I,
-           __m256 __B) {
-  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
-                   /* idx */ ,
-                   (__v8sf) __A,
-                   (__v8sf) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I,
+                                                (__v2di)__B);
+}
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I,
-            __m256 __B) {
-  return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I
-              /* idx */ ,
-              (__v8sf) __A,
-              (__v8sf) __B,
-              (__mmask8)
-              __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I,
+                            __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128(__U,
+                                  (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
+                                  (__v2di)__A);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) {
-  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
-                   /* idx */ ,
-                   (__v2di) __A,
-                   (__v2di) __B,
-                   (__mmask8) -1);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
+                             __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128(__U,
+                                  (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
+                                  (__v2di)__I);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I,
-           __m128i __B) {
-  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
-                   /* idx */ ,
-                   (__v2di) __A,
-                   (__v2di) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I,
+                             __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128(__U,
+                                  (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
+                                  (__v2di)_mm_setzero_si128());
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I,
-            __m128i __B) {
-  return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I
-              /* idx */ ,
-              (__v2di) __A,
-              (__v2di) __B,
-              (__mmask8)
-              __U);
-}
 
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I,
+                                                (__v4di) __B);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) {
-  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
-                   /* idx */ ,
-                   (__v4di) __A,
-                   (__v4di) __B,
-                   (__mmask8) -1);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I,
+                               __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256(__U,
+                               (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
+                               (__v4di)__A);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I,
-        __m256i __B) {
-  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
-                   /* idx */ ,
-                   (__v4di) __A,
-                   (__v4di) __B,
-                   (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U,
+                                __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256(__U,
+                               (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
+                               (__v4di)__I);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
-         __m256i __I, __m256i __B) {
-  return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I
-              /* idx */ ,
-              (__v4di) __A,
-              (__v4di) __B,
-              (__mmask8)
-              __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I,
+                                __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256(__U,
+                               (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
+                               (__v4di)_mm256_setzero_si256());
+}
 
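With the masking stripped away, the unmasked forms are now plain two-table shuffles, which makes direct use straightforward. Usage sketch (assumes AVX512F+AVX512VL):

#include <immintrin.h>

/* Collect the even lanes of a, then the even lanes of b: in the 8-lane
 * epi32 form, indices 0-7 select from a and 8-15 from b. */
static __m256i even_lanes(__m256i a, __m256i b) {
  const __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
  return _mm256_permutex2var_epi32(a, idx, b);
}
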
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
-                (__v4si) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi8_epi32(__A),
+                                             (__v4si)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
-                (__v4si)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi8_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
-                (__v8si) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi8_epi32(__A),
+                                             (__v8si)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
-                (__v8si)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi8_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi8_epi64(__A),
+                                             (__v2di)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
-                (__v2di)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi8_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi8_epi64(__A),
+                                             (__v4di)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
-                (__v4di)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi8_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
-{
-  return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi32_epi64(__X),
+                                             (__v2di)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
-{
-  return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
-                (__v2di)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi32_epi64(__X),
+                                             (__v2di)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
-{
-  return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi32_epi64(__X),
+                                             (__v4di)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
-{
-  return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
-                (__v4di)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi32_epi64(__X),
+                                             (__v4di)_mm256_setzero_si256());
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
-                (__v4si) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi16_epi32(__A),
+                                             (__v4si)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
-                (__v4si)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi16_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
-                (__v8si) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi16_epi32(__A),
+                                             (__v8si)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
-                (__v8si)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi16_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi16_epi64(__A),
+                                             (__v2di)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
-                (__v2di)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi16_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi16_epi64(__A),
+                                             (__v4di)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
-                (__v4di)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi16_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
 
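The widening sign-extension conversions follow the same pattern, reusing the unmasked SSE4.1/AVX2 intrinsics (_mm_cvtepi8_epi32 and friends) for the extension itself. Usage sketch (assumes AVX512VL):

#include <immintrin.h>

/* Sign-extend the low four bytes of a into 32-bit lanes, zeroing lanes
 * whose mask bit is clear. */
static __m128i widen_low_bytes(__mmask8 m, __m128i a) {
  return _mm_maskz_cvtepi8_epi32(m, a);
}
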
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
-                (__v4si) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu8_epi32(__A),
+                                             (__v4si)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
-                (__v4si)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu8_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
-                (__v8si) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu8_epi32(__A),
+                                             (__v8si)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
-                (__v8si)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu8_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu8_epi64(__A),
+                                             (__v2di)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
-                (__v2di)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu8_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu8_epi64(__A),
+                                             (__v4di)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
-                (__v4di)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu8_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
-{
-  return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu32_epi64(__X),
+                                             (__v2di)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
-{
-  return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
-                (__v2di)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu32_epi64(__X),
+                                             (__v2di)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
-{
-  return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu32_epi64(__X),
+                                             (__v4di)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
-{
-  return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
-                (__v4di)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu32_epi64(__X),
+                                             (__v4di)_mm256_setzero_si256());
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
-                (__v4si) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu16_epi32(__A),
+                                             (__v4si)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
-                (__v4si)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu16_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
-                (__v8si) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu16_epi32(__A),
+                                             (__v8si)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
-                (__v8si)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu16_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu16_epi64(__A),
+                                             (__v2di)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
-                (__v2di)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu16_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu16_epi64(__A),
+                                             (__v4di)__W);
+}
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
-                (__v4di)
-                _mm256_setzero_si256 (),
-                (__mmask8) __U);
-}
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu16_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
 
 
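The zero-extending cvtepu* wrappers above mirror the sign-extending ones exactly; only the underlying unmasked intrinsic differs. For contrast (sketch, assumes AVX512VL):

#include <immintrin.h>

static __m128i widen_unsigned(__mmask8 m, __m128i a) {
  return _mm_maskz_cvtepu8_epi32(m, a);  /* zero-extend bytes */
}

static __m128i widen_signed(__mmask8 m, __m128i a) {
  return _mm_maskz_cvtepi8_epi32(m, a);  /* sign-extend bytes */
}
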
-#define _mm_rol_epi32(a, b) __extension__ ({\
-  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
-                                        (__v4si)_mm_setzero_si128(), \
-                                        (__mmask8)-1); })
+#define _mm_rol_epi32(a, b) \
+  (__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b))
 
-#define _mm_mask_rol_epi32(w, u, a, b) __extension__ ({\
-  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
-                                        (__v4si)(__m128i)(w), (__mmask8)(u)); })
+#define _mm_mask_rol_epi32(w, u, a, b) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
+                                      (__v4si)_mm_rol_epi32((a), (b)), \
+                                      (__v4si)(__m128i)(w))
 
-#define _mm_maskz_rol_epi32(u, a, b) __extension__ ({\
-  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
-                                        (__v4si)_mm_setzero_si128(), \
-                                        (__mmask8)(u)); })
+#define _mm_maskz_rol_epi32(u, a, b) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
+                                      (__v4si)_mm_rol_epi32((a), (b)), \
+                                      (__v4si)_mm_setzero_si128())
 
-#define _mm256_rol_epi32(a, b) __extension__ ({\
-  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
-                                        (__v8si)_mm256_setzero_si256(), \
-                                        (__mmask8)-1); })
+#define _mm256_rol_epi32(a, b) \
+  (__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b))
 
-#define _mm256_mask_rol_epi32(w, u, a, b) __extension__ ({\
-  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
-                                        (__v8si)(__m256i)(w), (__mmask8)(u)); })
+#define _mm256_mask_rol_epi32(w, u, a, b) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
+                                      (__v8si)_mm256_rol_epi32((a), (b)), \
+                                      (__v8si)(__m256i)(w))
 
-#define _mm256_maskz_rol_epi32(u, a, b) __extension__ ({\
-  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
-                                        (__v8si)_mm256_setzero_si256(), \
-                                        (__mmask8)(u)); })
+#define _mm256_maskz_rol_epi32(u, a, b) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
+                                      (__v8si)_mm256_rol_epi32((a), (b)), \
+                                      (__v8si)_mm256_setzero_si256())
 
-#define _mm_rol_epi64(a, b) __extension__ ({\
-  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
-                                        (__v2di)_mm_setzero_di(), \
-                                        (__mmask8)-1); })
+#define _mm_rol_epi64(a, b) \
+  (__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b))
 
-#define _mm_mask_rol_epi64(w, u, a, b) __extension__ ({\
-  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
-                                        (__v2di)(__m128i)(w), (__mmask8)(u)); })
+#define _mm_mask_rol_epi64(w, u, a, b) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
+                                      (__v2di)_mm_rol_epi64((a), (b)), \
+                                      (__v2di)(__m128i)(w))
 
-#define _mm_maskz_rol_epi64(u, a, b) __extension__ ({\
-  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
-                                        (__v2di)_mm_setzero_di(), \
-                                        (__mmask8)(u)); })
+#define _mm_maskz_rol_epi64(u, a, b) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
+                                      (__v2di)_mm_rol_epi64((a), (b)), \
+                                      (__v2di)_mm_setzero_si128())
 
-#define _mm256_rol_epi64(a, b) __extension__ ({\
-  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
-                                        (__v4di)_mm256_setzero_si256(), \
-                                        (__mmask8)-1); })
+#define _mm256_rol_epi64(a, b) \
+  (__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b))
 
-#define _mm256_mask_rol_epi64(w, u, a, b) __extension__ ({\
-  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
-                                        (__v4di)(__m256i)(w), (__mmask8)(u)); })
+#define _mm256_mask_rol_epi64(w, u, a, b) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
+                                      (__v4di)_mm256_rol_epi64((a), (b)), \
+                                      (__v4di)(__m256i)(w))
 
-#define _mm256_maskz_rol_epi64(u, a, b) __extension__ ({\
-  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
-                                        (__v4di)_mm256_setzero_si256(), \
-                                        (__mmask8)(u)); })
+#define _mm256_maskz_rol_epi64(u, a, b) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
+                                      (__v4di)_mm256_rol_epi64((a), (b)), \
+                                      (__v4di)_mm256_setzero_si256())
 
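The rotate-by-immediate forms above stay macros because the count must be a compile-time constant, but they drop the GNU statement-expression wrapper (__extension__ ({ ... })), presumably so they also work in contexts where statement expressions are not allowed, and they compose an unmasked __builtin_ia32_prol* call with the same select builtins. The old nonstandard _mm_setzero_di() helper is gone in favor of _mm_setzero_si128(). Roughly, a call such as _mm_maskz_rol_epi32(m, a, 3) now expands to (sketch, not verbatim preprocessor output):

(__m128i)__builtin_ia32_selectd_128((__mmask8)(m),
    (__v4si)(__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(3)),
    (__v4si)_mm_setzero_si128())
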
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_rolv_epi32 (__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si) __W,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                             (__v4si)_mm_rolv_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                             (__v4si)_mm_rolv_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_rolv_epi32 (__m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_prolvd256((__v8si)__A, (__v8si)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si) __W,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                            (__v8si)_mm256_rolv_epi32(__A, __B),
+                                            (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                            (__v8si)_mm256_rolv_epi32(__A, __B),
+                                            (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_rolv_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_di (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_prolvq128((__v2di)__A, (__v2di)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di) __W,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128(__U,
+                                             (__v2di)_mm_rolv_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_di (),
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128(__U,
+                                             (__v2di)_mm_rolv_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_rolv_epi64 (__m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_prolvq256((__v4di)__A, (__v4di)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di) __W,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256(__U,
+                                            (__v4di)_mm256_rolv_epi64(__A, __B),
+                                            (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256(__U,
+                                            (__v4di)_mm256_rolv_epi64(__A, __B),
+                                            (__v4di)_mm256_setzero_si256());
 }
 
-#define _mm_ror_epi32(A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
-                                        (__v4si)_mm_setzero_si128(), \
-                                        (__mmask8)-1); })
+#define _mm_ror_epi32(a, b) \
+  (__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b))
 
-#define _mm_mask_ror_epi32(W, U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
-                                        (__v4si)(__m128i)(W), (__mmask8)(U)); })
+#define _mm_mask_ror_epi32(w, u, a, b) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
+                                      (__v4si)_mm_ror_epi32((a), (b)), \
+                                      (__v4si)(__m128i)(w))
 
-#define _mm_maskz_ror_epi32(U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
-                                        (__v4si)_mm_setzero_si128(), \
-                                        (__mmask8)(U)); })
+#define _mm_maskz_ror_epi32(u, a, b) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
+                                      (__v4si)_mm_ror_epi32((a), (b)), \
+                                      (__v4si)_mm_setzero_si128())
 
-#define _mm256_ror_epi32(A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
-                                        (__v8si)_mm256_setzero_si256(), \
-                                        (__mmask8)-1); })
+#define _mm256_ror_epi32(a, b) \
+  (__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b))
 
-#define _mm256_mask_ror_epi32(W, U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
-                                        (__v8si)(__m256i)(W), (__mmask8)(U)); })
+#define _mm256_mask_ror_epi32(w, u, a, b) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
+                                      (__v8si)_mm256_ror_epi32((a), (b)), \
+                                      (__v8si)(__m256i)(w))
 
-#define _mm256_maskz_ror_epi32(U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
-                                        (__v8si)_mm256_setzero_si256(), \
-                                        (__mmask8)(U)); })
+#define _mm256_maskz_ror_epi32(u, a, b) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
+                                      (__v8si)_mm256_ror_epi32((a), (b)), \
+                                      (__v8si)_mm256_setzero_si256())
 
-#define _mm_ror_epi64(A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
-                                        (__v2di)_mm_setzero_di(), \
-                                        (__mmask8)-1); })
+#define _mm_ror_epi64(a, b) \
+  (__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b))
 
-#define _mm_mask_ror_epi64(W, U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
-                                        (__v2di)(__m128i)(W), (__mmask8)(U)); })
+#define _mm_mask_ror_epi64(w, u, a, b) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
+                                      (__v2di)_mm_ror_epi64((a), (b)), \
+                                      (__v2di)(__m128i)(w))
 
-#define _mm_maskz_ror_epi64(U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
-                                        (__v2di)_mm_setzero_di(), \
-                                        (__mmask8)(U)); })
+#define _mm_maskz_ror_epi64(u, a, b) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
+                                      (__v2di)_mm_ror_epi64((a), (b)), \
+                                      (__v2di)_mm_setzero_si128())
 
-#define _mm256_ror_epi64(A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
-                                        (__v4di)_mm256_setzero_si256(), \
-                                        (__mmask8)-1); })
+#define _mm256_ror_epi64(a, b) \
+  (__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b))
 
-#define _mm256_mask_ror_epi64(W, U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
-                                        (__v4di)(__m256i)(W), (__mmask8)(U)); })
+#define _mm256_mask_ror_epi64(w, u, a, b) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
+                                      (__v4di)_mm256_ror_epi64((a), (b)), \
+                                      (__v4di)(__m256i)(w))
 
-#define _mm256_maskz_ror_epi64(U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
-                                        (__v4di)_mm256_setzero_si256(), \
-                                        (__mmask8)(U)); })
+#define _mm256_maskz_ror_epi64(u, a, b) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
+                                      (__v4di)_mm256_ror_epi64((a), (b)), \
+                                      (__v4di)_mm256_setzero_si256())
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sll_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sll_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
-             (__v4si) __B,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sll_epi32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
-             (__v4si) __B,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sll_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-#define _mm_mask_slli_epi32(W, U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \
-                                         (__v4si)(__m128i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm_maskz_slli_epi32(U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \
-                                         (__v4si)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_mask_slli_epi32(W, U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \
-                                         (__v8si)(__m256i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_maskz_slli_epi32(U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \
-                                         (__v8si)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_slli_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di)
-             _mm_setzero_di (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_slli_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
 {
-  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
-             (__v2di) __B,
-             (__v4di) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_slli_epi32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B)
 {
-  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
-             (__v2di) __B,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_slli_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-#define _mm_mask_slli_epi64(W, U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \
-                                         (__v2di)(__m128i)(W), \
-                                         (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sll_epi64(__A, __B),
+                                             (__v2di)__W);
+}
 
-#define _mm_maskz_slli_epi64(U, A, B) __extension__ ({ \
-  (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \
-                                         (__v2di)_mm_setzero_di(), \
-                                         (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sll_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
+}
 
-#define _mm256_mask_slli_epi64(W, U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \
-                                         (__v4di)(__m256i)(W), \
-                                         (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sll_epi64(__A, __B),
+                                             (__v4di)__W);
+}
 
-#define _mm256_maskz_slli_epi64(U, A, B) __extension__ ({ \
-  (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \
-                                         (__v4di)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sll_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
 
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_slli_epi64(__A, __B),
+                                             (__v2di)__W);
+}
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_slli_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_slli_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_slli_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_rorv_epi32 (__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_prorvd128((__v4si)__A, (__v4si)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si) __W,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                             (__v4si)_mm_rorv_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
-              (__v4si) __B,
-              (__v4si)
-              _mm_setzero_si128 (),
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                             (__v4si)_mm_rorv_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_rorv_epi32 (__m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_prorvd256((__v8si)__A, (__v8si)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si) __W,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                            (__v8si)_mm256_rorv_epi32(__A, __B),
+                                            (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
-              (__v8si) __B,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                            (__v8si)_mm256_rorv_epi32(__A, __B),
+                                            (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_rorv_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_di (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_prorvq128((__v2di)__A, (__v2di)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-         __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di) __W,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128(__U,
+                                             (__v2di)_mm_rorv_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
-              (__v2di) __B,
-              (__v2di)
-              _mm_setzero_di (),
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128(__U,
+                                             (__v2di)_mm_rorv_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_rorv_epi64 (__m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_prorvq256((__v4di)__A, (__v4di)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-      __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di) __W,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256(__U,
+                                            (__v4di)_mm256_rorv_epi64(__A, __B),
+                                            (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
-              (__v4di) __B,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256(__U,
+                                            (__v4di)_mm256_rorv_epi64(__A, __B),
+                                            (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
-         __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
-             (__v2di) __Y,
-             (__v2di) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sllv_epi64(__X, __Y),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
-             (__v2di) __Y,
-             (__v2di)
-             _mm_setzero_di (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sllv_epi64(__X, __Y),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
-             (__v4di) __Y,
-             (__v4di) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_sllv_epi64(__X, __Y),
+                                            (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
-             (__v4di) __Y,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_sllv_epi64(__X, __Y),
+                                            (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
-         __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
-             (__v4si) __Y,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sllv_epi32(__X, __Y),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
-             (__v4si) __Y,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sllv_epi32(__X, __Y),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
-             (__v8si) __Y,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_sllv_epi32(__X, __Y),
+                                            (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
-             (__v8si) __Y,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_sllv_epi32(__X, __Y),
+                                            (__v8si)_mm256_setzero_si256());
 }
 
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
-         __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
-             (__v2di) __Y,
-             (__v2di) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srlv_epi64(__X, __Y),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
-             (__v2di) __Y,
-             (__v2di)
-             _mm_setzero_di (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srlv_epi64(__X, __Y),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
-             (__v4di) __Y,
-             (__v4di) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_srlv_epi64(__X, __Y),
+                                            (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
-             (__v4di) __Y,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_srlv_epi64(__X, __Y),
+                                            (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
-         __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
-             (__v4si) __Y,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srlv_epi32(__X, __Y),
+                                            (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
-             (__v4si) __Y,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srlv_epi32(__X, __Y),
+                                            (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
-             (__v8si) __Y,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srlv_epi32(__X, __Y),
+                                            (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
-             (__v8si) __Y,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srlv_epi32(__X, __Y),
+                                            (__v8si)_mm256_setzero_si256());
 }
 
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srl_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srl_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
-             (__v4si) __B,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srl_epi32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
-             (__v4si) __B,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srl_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-#define _mm_mask_srli_epi32(W, U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \
-                                         (__v4si)(__m128i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm_maskz_srli_epi32(U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \
-                                         (__v4si)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_mask_srli_epi32(W, U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \
-                                         (__v8si)(__m256i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_maskz_srli_epi32(U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \
-                                         (__v8si)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srli_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di)
-             _mm_setzero_di (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srli_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
 {
-  return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
-             (__v2di) __B,
-             (__v4di) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srli_epi32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B)
 {
-  return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
-             (__v2di) __B,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srli_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-#define _mm_mask_srli_epi64(W, U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
-                                         (__v2di)(__m128i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm_maskz_srli_epi64(U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
-                                         (__v2di)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_mask_srli_epi64(W, U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
-                                         (__v4di)(__m256i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_maskz_srli_epi64(U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
-                                         (__v4di)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
-         __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
-             (__v4si) __Y,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srl_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
-             (__v4si) __Y,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srl_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
-             (__v8si) __Y,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srl_epi64(__A, __B),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
-             (__v8si) __Y,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srl_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srav_epi64 (__m128i __X, __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
-              (__v2di) __Y,
-              (__v2di)
-              _mm_setzero_di (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srli_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
-         __m128i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
-              (__v2di) __Y,
-              (__v2di) __W,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srli_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
-              (__v2di) __Y,
-              (__v2di)
-              _mm_setzero_di (),
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srli_epi64(__A, __B),
+                                             (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_srav_epi64 (__m256i __X, __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B)
 {
-  return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
-              (__v4di) __Y,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srli_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
-              (__v4di) __Y,
-              (__v4di) __W,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srav_epi32(__X, __Y),
+                                            (__v4si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srav_epi32(__X, __Y),
+                                            (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srav_epi32(__X, __Y),
+                                            (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srav_epi32(__X, __Y),
+                                            (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srav_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srav_epi64(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srav_epi64(__X, __Y),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srav_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srav_epi64(__X, __Y),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
-              (__v4di) __Y,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srav_epi64(__X, __Y),
+                                             (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
@@ -5757,7 +4964,7 @@
                  (__v4si) __W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
@@ -5766,7 +4973,7 @@
 }
 
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
@@ -5774,7 +4981,7 @@
                  (__v8si) __W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
@@ -5782,7 +4989,7 @@
                  (__v8si) _mm256_setzero_si256 ());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
@@ -5791,7 +4998,7 @@
               __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
@@ -5801,7 +5008,7 @@
               __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
@@ -5810,7 +5017,7 @@
               __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_load_epi32 (__mmask8 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
@@ -5820,7 +5027,7 @@
               __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A)
 {
   __builtin_ia32_movdqa32store128_mask ((__v4si *) __P,
@@ -5828,7 +5035,7 @@
           (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
 {
   __builtin_ia32_movdqa32store256_mask ((__v8si *) __P,
@@ -5836,7 +5043,7 @@
           (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
@@ -5844,15 +5051,15 @@
                  (__v2di) __W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
 {
   return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
                  (__v2di) __A,
-                 (__v2di) _mm_setzero_di ());
+                 (__v2di) _mm_setzero_si128 ());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
@@ -5860,7 +5067,7 @@
                  (__v4di) __W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
 {
   return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
@@ -5868,7 +5075,7 @@
                  (__v4di) _mm256_setzero_si256 ());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
@@ -5877,17 +5084,17 @@
               __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
               (__v2di)
-              _mm_setzero_di (),
+              _mm_setzero_si128 (),
               (__mmask8)
               __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
@@ -5896,7 +5103,7 @@
               __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_load_epi64 (__mmask8 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
@@ -5906,7 +5113,7 @@
               __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A)
 {
   __builtin_ia32_movdqa64store128_mask ((__v2di *) __P,
@@ -5914,7 +5121,7 @@
           (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
 {
   __builtin_ia32_movdqa64store256_mask ((__v4di *) __P,
@@ -5922,7 +5129,7 @@
           (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
 {
   return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
@@ -5930,7 +5137,7 @@
                                               (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
 {
   return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
@@ -5938,7 +5145,7 @@
                                               (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
 {
   return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
@@ -5946,7 +5153,7 @@
                                               (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
 {
   return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
@@ -5954,132 +5161,144 @@
                                               (__v4df)_mm256_setzero_pd());
 }
 
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A)
+{
+   return (__m128i)__builtin_ia32_selectd_128(__M,
+                                              (__v4si) _mm_set1_epi32(__A),
+                                              (__v4si)__O);
+}
 
-#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \
-  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
-                                                  (__v4si)(__m128i)(O), \
-                                                  (__mmask8)(M)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_set1_epi32(__mmask8 __M, int __A)
+{
+   return (__m128i)__builtin_ia32_selectd_128(__M,
+                                              (__v4si) _mm_set1_epi32(__A),
+                                              (__v4si)_mm_setzero_si128());
+}
 
-#define _mm_maskz_set1_epi32(M, A) __extension__ ({ \
-  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
-                                                  (__v4si)_mm_setzero_si128(), \
-                                                  (__mmask8)(M)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A)
+{
+   return (__m256i)__builtin_ia32_selectd_256(__M,
+                                              (__v8si) _mm256_set1_epi32(__A),
+                                              (__v8si)__O);
+}
 
-#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \
-  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
-                                                  (__v8si)(__m256i)(O), \
-                                                  (__mmask8)(M)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_set1_epi32(__mmask8 __M, int __A)
+{
+   return (__m256i)__builtin_ia32_selectd_256(__M,
+                                              (__v8si) _mm256_set1_epi32(__A),
+                                              (__v8si)_mm256_setzero_si256());
+}
 
-#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \
-  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
-                                                  (__v8si)_mm256_setzero_si256(), \
-                                                  (__mmask8)(M)); })
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
 {
-  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O,
-                 __M);
+  return (__m128i) __builtin_ia32_selectq_128(__M,
+                                              (__v2di) _mm_set1_epi64x(__A),
+                                              (__v2di) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
-  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A,
-                 (__v2di)
-                 _mm_setzero_si128 (),
-                 __M);
+  return (__m128i) __builtin_ia32_selectq_128(__M,
+                                              (__v2di) _mm_set1_epi64x(__A),
+                                              (__v2di) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
 {
-  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O,
-                 __M);
+  return (__m256i) __builtin_ia32_selectq_256(__M,
+                                              (__v4di) _mm256_set1_epi64x(__A),
+                                              (__v4di) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
-  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A,
-                 (__v4di)
-                 _mm256_setzero_si256 (),
-                 __M);
+   return (__m256i) __builtin_ia32_selectq_256(__M,
+                                               (__v4di) _mm256_set1_epi64x(__A),
+                                               (__v4di) _mm256_setzero_si256());
 }
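(The set1 family above moves from GPR-broadcast builtins wrapped in
statement-expression macros to real inline functions: broadcast with the
plain _mm*_set1_* intrinsic, then blend lanes through the generic select
builtins. The same semantics with public intrinsics only -- a sketch, with
the helper name invented for illustration:

    #include <immintrin.h>
    /* Lane i of the result is set1(a) where bit i of m is set, otherwise
       the corresponding lane of src; compile with -mavx512vl. */
    static __m128i mask_set1_epi32_sketch(__m128i src, __mmask8 m, int a)
    {
      return _mm_mask_mov_epi32(src, m, _mm_set1_epi32(a));
    }
)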
 
-#define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+#define _mm_fixupimm_pd(A, B, C, imm) \
   (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2di)(__m128i)(C), (int)(imm), \
-                                             (__mmask8)-1); })
+                                             (__mmask8)-1)
 
-#define _mm_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+#define _mm_mask_fixupimm_pd(A, U, B, C, imm) \
   (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2di)(__m128i)(C), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) \
   (__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2di)(__m128i)(C), \
-                                              (int)(imm), (__mmask8)(U)); })
+                                              (int)(imm), (__mmask8)(U))
 
-#define _mm256_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+#define _mm256_fixupimm_pd(A, B, C, imm) \
   (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
                                              (__v4df)(__m256d)(B), \
                                              (__v4di)(__m256i)(C), (int)(imm), \
-                                             (__mmask8)-1); })
+                                             (__mmask8)-1)
 
-#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) \
   (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
                                              (__v4df)(__m256d)(B), \
                                              (__v4di)(__m256i)(C), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) \
   (__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \
                                               (__v4df)(__m256d)(B), \
                                               (__v4di)(__m256i)(C), \
-                                              (int)(imm), (__mmask8)(U)); })
+                                              (int)(imm), (__mmask8)(U))
 
-#define _mm_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+#define _mm_fixupimm_ps(A, B, C, imm) \
   (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4si)(__m128i)(C), (int)(imm), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+#define _mm_mask_fixupimm_ps(A, U, B, C, imm) \
   (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4si)(__m128i)(C), (int)(imm), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \
   (__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4si)(__m128i)(C), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm256_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+#define _mm256_fixupimm_ps(A, B, C, imm) \
   (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
                                             (__v8sf)(__m256)(B), \
                                             (__v8si)(__m256i)(C), (int)(imm), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) \
   (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
                                             (__v8sf)(__m256)(B), \
                                             (__v8si)(__m256i)(C), (int)(imm), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \
   (__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \
                                              (__v8sf)(__m256)(B), \
                                              (__v8si)(__m256i)(C), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
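(Every fixupimm macro above, like the permute macros later in this hunk,
drops its GNU statement-expression wrapper. Each body is a single cast
expression, so __extension__ ({ ...; }) added nothing and only restricted
the contexts in which the macro could appear. A minimal before/after sketch
with an invented name:

    /* Before: GNU-only statement expression. */
    #define OLD_SHAPE(A) __extension__ ({ (int)(A); })
    /* After: a plain expression, valid anywhere an expression is. */
    #define NEW_SHAPE(A) (int)(A)
)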
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
 {
   return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
@@ -6087,7 +5306,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_load_pd (__mmask8 __U, void const *__P)
 {
   return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
@@ -6096,7 +5315,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
 {
   return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
@@ -6104,7 +5323,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_load_pd (__mmask8 __U, void const *__P)
 {
   return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
@@ -6113,7 +5332,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
 {
   return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
@@ -6121,7 +5340,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_load_ps (__mmask8 __U, void const *__P)
 {
   return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
@@ -6130,7 +5349,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
 {
   return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
@@ -6138,7 +5357,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_load_ps (__mmask8 __U, void const *__P)
 {
   return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
@@ -6147,7 +5366,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P,
@@ -6155,7 +5374,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P,
@@ -6164,7 +5383,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
@@ -6172,7 +5391,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
@@ -6181,7 +5400,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P,
@@ -6189,7 +5408,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
 {
   return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P,
@@ -6198,7 +5417,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
@@ -6206,7 +5425,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
 {
   return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
@@ -6215,7 +5434,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
 {
   return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P,
@@ -6223,7 +5442,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
 {
   return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P,
@@ -6232,7 +5451,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
 {
   return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
@@ -6240,7 +5459,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
 {
   return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
@@ -6249,7 +5468,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
 {
   return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P,
@@ -6257,7 +5476,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
 {
   return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P,
@@ -6266,7 +5485,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
 {
   return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
@@ -6274,7 +5493,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_loadu_ps (__mmask8 __U, void const *__P)
 {
   return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
@@ -6283,7 +5502,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
 {
   __builtin_ia32_storeapd128_mask ((__v2df *) __P,
@@ -6291,7 +5510,7 @@
            (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
 {
   __builtin_ia32_storeapd256_mask ((__v4df *) __P,
@@ -6299,7 +5518,7 @@
            (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
 {
   __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
@@ -6307,7 +5526,7 @@
            (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
 {
   __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
@@ -6315,7 +5534,7 @@
            (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
 {
   __builtin_ia32_storedqudi128_mask ((__v2di *) __P,
@@ -6323,7 +5542,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
 {
   __builtin_ia32_storedqudi256_mask ((__v4di *) __P,
@@ -6331,7 +5550,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
 {
   __builtin_ia32_storedqusi128_mask ((__v4si *) __P,
@@ -6339,7 +5558,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
 {
   __builtin_ia32_storedqusi256_mask ((__v8si *) __P,
@@ -6347,7 +5566,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
 {
   __builtin_ia32_storeupd128_mask ((__v2df *) __P,
@@ -6355,7 +5574,7 @@
            (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
 {
   __builtin_ia32_storeupd256_mask ((__v4df *) __P,
@@ -6363,7 +5582,7 @@
            (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
 {
   __builtin_ia32_storeups128_mask ((__v4sf *) __P,
@@ -6371,7 +5590,7 @@
            (__mmask8) __U);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
 {
   __builtin_ia32_storeups256_mask ((__v8sf *) __P,
@@ -6380,7 +5599,7 @@
 }
 
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
@@ -6388,7 +5607,7 @@
                                               (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
@@ -6396,7 +5615,7 @@
                                               (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
 {
   return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
@@ -6404,7 +5623,7 @@
                                            (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B)
 {
   return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
@@ -6412,7 +5631,7 @@
                                            (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
@@ -6420,7 +5639,7 @@
                                              (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B)
 {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
@@ -6428,7 +5647,7 @@
                                              (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
 {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
@@ -6436,7 +5655,7 @@
                                            (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B)
 {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
@@ -6444,7 +5663,7 @@
                                            (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
@@ -6452,7 +5671,7 @@
                                               (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
@@ -6460,7 +5679,7 @@
                                               (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
 {
   return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
@@ -6468,7 +5687,7 @@
                                            (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B)
 {
   return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
@@ -6476,7 +5695,7 @@
                                            (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
@@ -6484,7 +5703,7 @@
                                              (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B)
 {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
@@ -6492,7 +5711,7 @@
                                              (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
 {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
@@ -6500,7 +5719,7 @@
                                            (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B)
 {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
@@ -6508,7 +5727,7 @@
                                            (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_rcp14_pd (__m128d __A)
 {
   return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
@@ -6517,7 +5736,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
 {
   return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
@@ -6525,7 +5744,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
 {
   return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
@@ -6534,7 +5753,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_rcp14_pd (__m256d __A)
 {
   return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
@@ -6543,7 +5762,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
 {
   return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
@@ -6551,7 +5770,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
 {
   return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
@@ -6560,7 +5779,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_rcp14_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
@@ -6569,7 +5788,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
 {
   return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
@@ -6577,7 +5796,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
 {
   return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
@@ -6586,7 +5805,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_rcp14_ps (__m256 __A)
 {
   return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
@@ -6595,7 +5814,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
 {
   return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
@@ -6603,7 +5822,7 @@
                (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
 {
   return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
@@ -6612,251 +5831,219 @@
                (__mmask8) __U);
 }
 
-#define _mm_mask_permute_pd(W, U, X, C) __extension__ ({ \
+#define _mm_mask_permute_pd(W, U, X, C) \
   (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
                                        (__v2df)_mm_permute_pd((X), (C)), \
-                                       (__v2df)(__m128d)(W)); })
+                                       (__v2df)(__m128d)(W))
 
-#define _mm_maskz_permute_pd(U, X, C) __extension__ ({ \
+#define _mm_maskz_permute_pd(U, X, C) \
   (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
                                        (__v2df)_mm_permute_pd((X), (C)), \
-                                       (__v2df)_mm_setzero_pd()); })
+                                       (__v2df)_mm_setzero_pd())
 
-#define _mm256_mask_permute_pd(W, U, X, C) __extension__ ({ \
+#define _mm256_mask_permute_pd(W, U, X, C) \
   (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                        (__v4df)_mm256_permute_pd((X), (C)), \
-                                       (__v4df)(__m256d)(W)); })
+                                       (__v4df)(__m256d)(W))
 
-#define _mm256_maskz_permute_pd(U, X, C) __extension__ ({ \
+#define _mm256_maskz_permute_pd(U, X, C) \
   (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                        (__v4df)_mm256_permute_pd((X), (C)), \
-                                       (__v4df)_mm256_setzero_pd()); })
+                                       (__v4df)_mm256_setzero_pd())
 
-#define _mm_mask_permute_ps(W, U, X, C) __extension__ ({ \
+#define _mm_mask_permute_ps(W, U, X, C) \
   (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
                                       (__v4sf)_mm_permute_ps((X), (C)), \
-                                      (__v4sf)(__m128)(W)); })
+                                      (__v4sf)(__m128)(W))
 
-#define _mm_maskz_permute_ps(U, X, C) __extension__ ({ \
+#define _mm_maskz_permute_ps(U, X, C) \
   (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
                                       (__v4sf)_mm_permute_ps((X), (C)), \
-                                      (__v4sf)_mm_setzero_ps()); })
+                                      (__v4sf)_mm_setzero_ps())
 
-#define _mm256_mask_permute_ps(W, U, X, C) __extension__ ({ \
+#define _mm256_mask_permute_ps(W, U, X, C) \
   (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
                                       (__v8sf)_mm256_permute_ps((X), (C)), \
-                                      (__v8sf)(__m256)(W)); })
+                                      (__v8sf)(__m256)(W))
 
-#define _mm256_maskz_permute_ps(U, X, C) __extension__ ({ \
+#define _mm256_maskz_permute_ps(U, X, C) \
   (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
                                       (__v8sf)_mm256_permute_ps((X), (C)), \
-                                      (__v8sf)_mm256_setzero_ps()); })
+                                      (__v8sf)_mm256_setzero_ps())
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
-      __m128i __C)
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C)
 {
-  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
-                 (__v2di) __C,
-                 (__v2df) __W,
-                 (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                            (__v2df)_mm_permutevar_pd(__A, __C),
+                                            (__v2df)__W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C)
 {
-  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
-                 (__v2di) __C,
-                 (__v2df)
-                 _mm_setzero_pd (),
-                 (__mmask8) __U);
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                            (__v2df)_mm_permutevar_pd(__A, __C),
+                                            (__v2df)_mm_setzero_pd());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
-         __m256i __C)
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C)
 {
-  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
-              (__v4di) __C,
-              (__v4df) __W,
-              (__mmask8)
-              __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                         (__v4df)_mm256_permutevar_pd(__A, __C),
+                                         (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
 {
-  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
-              (__v4di) __C,
-              (__v4df)
-              _mm256_setzero_pd (),
-              (__mmask8)
-              __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                         (__v4df)_mm256_permutevar_pd(__A, __C),
+                                         (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
-      __m128i __C)
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
 {
-  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
-                (__v4si) __C,
-                (__v4sf) __W,
-                (__mmask8) __U);
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                            (__v4sf)_mm_permutevar_ps(__A, __C),
+                                            (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
 {
-  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
-                (__v4si) __C,
-                (__v4sf)
-                _mm_setzero_ps (),
-                (__mmask8) __U);
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                            (__v4sf)_mm_permutevar_ps(__A, __C),
+                                            (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
-         __m256i __C)
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
 {
-  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
-                   (__v8si) __C,
-                   (__v8sf) __W,
-                   (__mmask8) __U);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                          (__v8sf)_mm256_permutevar_ps(__A, __C),
+                                          (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
 {
-  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
-                   (__v8si) __C,
-                   (__v8sf)
-                   _mm256_setzero_ps (),
-                   (__mmask8) __U);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                          (__v8sf)_mm256_permutevar_ps(__A, __C),
+                                          (__v8sf)_mm256_setzero_ps());
 }
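(The masked vpermilvar builtins above are replaced by the unmasked AVX
intrinsic followed by a mask select, the same pattern used throughout this
change. Equivalently, with public intrinsics -- a sketch whose helper name
is not from the header:

    #include <immintrin.h>
    /* Permute unmasked, then blend: lanes where u has a 0 bit keep w. */
    static __m128d mask_permutevar_pd_sketch(__m128d w, __mmask8 u,
                                             __m128d a, __m128i c)
    {
      return _mm_mask_mov_pd(w, u, _mm_permutevar_pd(a, c));
    }
)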
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_test_epi32_mask (__m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
-                 (__v4si) __B,
-                 (__mmask8) -1);
+  return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
-                 (__v4si) __B, __U);
+  return _mm_mask_cmpneq_epi32_mask (__U, _mm_and_si128 (__A, __B),
+                                     _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_test_epi32_mask (__m256i __A, __m256i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
-                 (__v8si) __B,
-                 (__mmask8) -1);
+  return _mm256_cmpneq_epi32_mask (_mm256_and_si256 (__A, __B),
+                                   _mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
-                 (__v8si) __B, __U);
+  return _mm256_mask_cmpneq_epi32_mask (__U, _mm256_and_si256 (__A, __B),
+                                        _mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_test_epi64_mask (__m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
-                 (__v2di) __B,
-                 (__mmask8) -1);
+  return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
-                 (__v2di) __B, __U);
+  return _mm_mask_cmpneq_epi64_mask (__U, _mm_and_si128 (__A, __B),
+                                     _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_test_epi64_mask (__m256i __A, __m256i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
-                 (__v4di) __B,
-                 (__mmask8) -1);
+  return _mm256_cmpneq_epi64_mask (_mm256_and_si256 (__A, __B),
+                                   _mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
-                 (__v4di) __B, __U);
+  return _mm256_mask_cmpneq_epi64_mask (__U, _mm256_and_si256 (__A, __B),
+                                        _mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_testn_epi32_mask (__m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
-            (__v4si) __B,
-            (__mmask8) -1);
+  return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
-            (__v4si) __B, __U);
+  return _mm_mask_cmpeq_epi32_mask (__U, _mm_and_si128 (__A, __B),
+                                    _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_testn_epi32_mask (__m256i __A, __m256i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
-            (__v8si) __B,
-            (__mmask8) -1);
+  return _mm256_cmpeq_epi32_mask (_mm256_and_si256 (__A, __B),
+                                  _mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
-            (__v8si) __B, __U);
+  return _mm256_mask_cmpeq_epi32_mask (__U, _mm256_and_si256 (__A, __B),
+                                       _mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_testn_epi64_mask (__m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
-            (__v2di) __B,
-            (__mmask8) -1);
+  return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
-            (__v2di) __B, __U);
+  return _mm_mask_cmpeq_epi64_mask (__U, _mm_and_si128 (__A, __B),
+                                    _mm_setzero_si128());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_testn_epi64_mask (__m256i __A, __m256i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
-            (__v4di) __B,
-            (__mmask8) -1);
+  return _mm256_cmpeq_epi64_mask (_mm256_and_si256 (__A, __B),
+                                  _mm256_setzero_si256());
 }
 
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
 _mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
-            (__v4di) __B, __U);
+  return _mm256_mask_cmpeq_epi64_mask (__U, _mm256_and_si256 (__A, __B),
+                                       _mm256_setzero_si256());
 }
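(The test/testn rewrites above drop the dedicated ptestm/ptestnm builtins:
test becomes a lane-wise (a & b) != 0 comparison and testn the matching
== 0 comparison, forms the backend can still pattern-match back to the
vptestm/vptestnm instructions. A small usage sketch under that reading:

    #include <immintrin.h>
    /* Bit i of the mask is set iff 32-bit lane i of (a & b) is nonzero. */
    __mmask8 any_common_bits(__m128i a, __m128i b)
    {
      return _mm_test_epi32_mask(a, b);
    }
)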
 
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
@@ -6864,7 +6051,7 @@
                                            (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
@@ -6872,7 +6059,7 @@
                                            (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
@@ -6880,7 +6067,7 @@
                                         (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
@@ -6888,7 +6075,7 @@
                                         (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
@@ -6896,15 +6083,15 @@
                                            (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                            (__v2di)_mm_unpackhi_epi64(__A, __B),
-                                           (__v2di)_mm_setzero_di());
+                                           (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
@@ -6912,7 +6099,7 @@
                                         (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
@@ -6920,7 +6107,7 @@
                                         (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
@@ -6928,7 +6115,7 @@
                                            (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
@@ -6936,7 +6123,7 @@
                                            (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
@@ -6944,7 +6131,7 @@
                                         (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
@@ -6952,7 +6139,7 @@
                                         (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
@@ -6960,15 +6147,15 @@
                                            (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                            (__v2di)_mm_unpacklo_epi64(__A, __B),
-                                           (__v2di)_mm_setzero_di());
+                                           (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
@@ -6976,7 +6163,7 @@
                                         (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
@@ -6984,352 +6171,330 @@
                                         (__v4di)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sra_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
-             (__v4si) __B,
-             (__v4si)
-             _mm_setzero_si128 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sra_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
-             (__v4si) __B,
-             (__v8si) __W,
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sra_epi32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
-             (__v4si) __B,
-             (__v8si)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sra_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
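(The variable-count sra_epi32 forms above follow the same
select-over-unmasked rewrite, and the immediate-count srai_epi32 macros
just below become real inline functions whose shift count is an ordinary,
type-checked int parameter. A small usage sketch:

    #include <immintrin.h>
    /* keep selects which lanes survive; cleared lanes become zero. */
    __m128i shift_kept_lanes(__m128i v, int n, __mmask8 keep)
    {
      return _mm_maskz_srai_epi32(keep, v, n);
    }
)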
 
-#define _mm_mask_srai_epi32(W, U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \
-                                         (__v4si)(__m128i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm_maskz_srai_epi32(U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \
-                                         (__v4si)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_mask_srai_epi32(W, U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \
-                                         (__v8si)(__m256i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_maskz_srai_epi32(U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \
-                                         (__v8si)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sra_epi64 (__m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di)
-             _mm_setzero_di (),
-             (__mmask8) -1);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srai_epi32(__A, __B),
+                                             (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srai_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
 {
-  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
-             (__v2di) __B,
-             (__v2di)
-             _mm_setzero_di (),
-             (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srai_epi32(__A, __B),
+                                             (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_sra_epi64 (__m256i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B)
 {
-  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
-             (__v2di) __B,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) -1);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srai_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-           __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_sra_epi64(__m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
-             (__v2di) __B,
-             (__v4di) __W,
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
-             (__v2di) __B,
-             (__v4di)
-             _mm256_setzero_si256 (),
-             (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sra_epi64(__A, __B),
+                                             (__v2di)__W);
 }
 
-#define _mm_srai_epi64(A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
-                                         (__v2di)_mm_setzero_di(), \
-                                         (__mmask8)-1); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sra_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
+}
 
-#define _mm_mask_srai_epi64(W, U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
-                                         (__v2di)(__m128i)(W), \
-                                         (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sra_epi64(__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B);
+}
 
-#define _mm_maskz_srai_epi64(U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
-                                         (__v2di)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                           (__v4di)_mm256_sra_epi64(__A, __B),
+                                           (__v4di)__W);
+}
 
-#define _mm256_srai_epi64(A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
-                                         (__v4di)_mm256_setzero_si256(), \
-                                         (__mmask8)-1); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                           (__v4di)_mm256_sra_epi64(__A, __B),
+                                           (__v4di)_mm256_setzero_si256());
+}
 
-#define _mm256_mask_srai_epi64(W, U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
-                                         (__v4di)(__m256i)(W), \
-                                         (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srai_epi64(__m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
+}
 
-#define _mm256_maskz_srai_epi64(U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
-                                         (__v4di)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_srai_epi64(__A, __imm),
+                                           (__v2di)__W);
+}
 
-#define _mm_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_srai_epi64(__A, __imm),
+                                           (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srai_epi64(__m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_srai_epi64(__A, __imm),
+                                        (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_srai_epi64(__A, __imm),
+                                        (__v4di)_mm256_setzero_si256());
+}
+
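Every rewritten shift above follows the same shape: compute the unmasked result, then blend it against the passthrough (mask_) or zero (maskz_) vector with a select builtin. A minimal scalar model of the _mm_mask_sra_epi64 pattern, runnable without AVX-512 (the helper name is invented for illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of _mm_mask_sra_epi64: shift every lane arithmetically,
       then keep lane i of the result only when bit i of the mask is set,
       falling back to the passthrough vector w otherwise.  Shift counts of
       64 or more saturate to 63, matching VPSRAQ's sign-fill behavior.
       (Right-shifting negative values is implementation-defined in ISO C
       but is an arithmetic shift on mainstream compilers.) */
    static void mask_sra_epi64_model(int64_t dst[2], const int64_t w[2],
                                     uint8_t u, const int64_t a[2], int count) {
      if (count > 63) count = 63;
      for (int i = 0; i < 2; ++i) {
        int64_t shifted = a[i] >> count;
        dst[i] = ((u >> i) & 1) ? shifted : w[i];
      }
    }

    int main(void) {
      int64_t a[2] = {-64, 64}, w[2] = {1, 2}, dst[2];
      mask_sra_epi64_model(dst, w, 0x1, a, 3);   /* only lane 0 is written */
      printf("%lld %lld\n", (long long)dst[0], (long long)dst[1]); /* -8 2 */
      return 0;
    }
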
+#define _mm_ternarylogic_epi32(A, B, C, imm) \
   (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
                                             (__v4si)(__m128i)(B), \
                                             (__v4si)(__m128i)(C), (int)(imm), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \
   (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
                                             (__v4si)(__m128i)(B), \
                                             (__v4si)(__m128i)(C), (int)(imm), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \
   (__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \
                                              (__v4si)(__m128i)(B), \
                                              (__v4si)(__m128i)(C), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm256_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+#define _mm256_ternarylogic_epi32(A, B, C, imm) \
   (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
                                             (__v8si)(__m256i)(B), \
                                             (__v8si)(__m256i)(C), (int)(imm), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \
   (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
                                             (__v8si)(__m256i)(B), \
                                             (__v8si)(__m256i)(C), (int)(imm), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \
   (__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \
                                              (__v8si)(__m256i)(B), \
                                              (__v8si)(__m256i)(C), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+#define _mm_ternarylogic_epi64(A, B, C, imm) \
   (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
                                             (__v2di)(__m128i)(B), \
                                             (__v2di)(__m128i)(C), (int)(imm), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \
   (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
                                             (__v2di)(__m128i)(B), \
                                             (__v2di)(__m128i)(C), (int)(imm), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \
   (__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \
                                              (__v2di)(__m128i)(B), \
                                              (__v2di)(__m128i)(C), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
-#define _mm256_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+#define _mm256_ternarylogic_epi64(A, B, C, imm) \
   (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
                                             (__v4di)(__m256i)(B), \
                                             (__v4di)(__m256i)(C), (int)(imm), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \
   (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
                                             (__v4di)(__m256i)(B), \
                                             (__v4di)(__m256i)(C), (int)(imm), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \
   (__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \
                                              (__v4di)(__m256i)(B), \
                                              (__v4di)(__m256i)(C), (int)(imm), \
-                                             (__mmask8)(U)); })
+                                             (__mmask8)(U))
 
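The ternarylogic immediate is a 3-input truth table: for each bit position, the result bit is imm[(a<<2)|(b<<1)|c], where a, b, c are the corresponding bits of the three sources. A scalar sketch of one 32-bit lane (helper name invented for illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of one 32-bit lane of VPTERNLOG: index the 8-bit
       immediate with the three input bits at each position. */
    static uint32_t ternlog32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm) {
      uint32_t r = 0;
      for (int bit = 0; bit < 32; ++bit) {
        unsigned idx = (((a >> bit) & 1u) << 2) | (((b >> bit) & 1u) << 1) |
                       ((c >> bit) & 1u);
        r |= (uint32_t)((imm >> idx) & 1u) << bit;
      }
      return r;
    }

    int main(void) {
      /* 0xCA is the classic bitwise select: per bit, a ? b : c. */
      uint32_t a = 0xFF00FF00u, b = 0x12345678u, c = 0x9ABCDEF0u;
      uint32_t expect = (a & b) | (~a & c);
      printf("%08X %08X\n", ternlog32(a, b, c, 0xCA), expect); /* equal */
      return 0;
    }
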
 
 
-#define _mm256_shuffle_f32x4(A, B, imm) __extension__ ({ \
-  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
-                                             (__v8sf)(__m256)(B), (int)(imm), \
-                                             (__v8sf)_mm256_setzero_ps(), \
-                                             (__mmask8)-1); })
+#define _mm256_shuffle_f32x4(A, B, imm) \
+  (__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \
+                                        (__v8sf)(__m256)(B), (int)(imm))
 
-#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
-                                             (__v8sf)(__m256)(B), (int)(imm), \
-                                             (__v8sf)(__m256)(W), \
-                                             (__mmask8)(U)); })
+#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
+                                      (__v8sf)(__m256)(W))
 
-#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
-  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
-                                             (__v8sf)(__m256)(B), (int)(imm), \
-                                             (__v8sf)_mm256_setzero_ps(), \
-                                             (__mmask8)(U)); })
+#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
+                                      (__v8sf)_mm256_setzero_ps())
 
-#define _mm256_shuffle_f64x2(A, B, imm) __extension__ ({ \
-  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
-                                              (__v4df)(__m256d)(B), \
-                                              (int)(imm), \
-                                              (__v4df)_mm256_setzero_pd(), \
-                                              (__mmask8)-1); })
+#define _mm256_shuffle_f64x2(A, B, imm) \
+  (__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \
+                                         (__v4df)(__m256d)(B), (int)(imm))
 
-#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
-                                              (__v4df)(__m256d)(B), \
-                                              (int)(imm), \
-                                              (__v4df)(__m256d)(W), \
-                                              (__mmask8)(U)); })
+#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                      (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
+                                      (__v4df)(__m256d)(W))
 
-#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
-  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
-                                              (__v4df)(__m256d)(B), \
-                                              (int)(imm), \
-                                              (__v4df)_mm256_setzero_pd(), \
-                                              (__mmask8)(U)); })
+#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                      (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
+                                      (__v4df)_mm256_setzero_pd())
 
-#define _mm256_shuffle_i32x4(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
-                                              (__v8si)(__m256i)(B), \
-                                              (int)(imm), \
-                                              (__v8si)_mm256_setzero_si256(), \
-                                              (__mmask8)-1); })
+#define _mm256_shuffle_i32x4(A, B, imm) \
+  (__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \
+                                         (__v8si)(__m256i)(B), (int)(imm))
 
-#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
-                                              (__v8si)(__m256i)(B), \
-                                              (int)(imm), \
-                                              (__v8si)(__m256i)(W), \
-                                              (__mmask8)(U)); })
+#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                      (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
+                                      (__v8si)(__m256i)(W))
 
-#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
-                                              (__v8si)(__m256i)(B), \
-                                              (int)(imm), \
-                                              (__v8si)_mm256_setzero_si256(), \
-                                              (__mmask8)(U)); })
+#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                      (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
+                                      (__v8si)_mm256_setzero_si256())
 
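As I read the 256-bit VSHUFF32X4/VSHUFI32X4 forms that the unmasked macros now map to, imm bit 0 picks which 128-bit half of A becomes the result's low half and imm bit 1 picks which half of B becomes the high half. A scalar sketch under that assumption:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Scalar model of the 256-bit x4 shuffle: each input has two 128-bit
       lanes (four dwords each); imm selects one lane from each input. */
    static void shuffle_x4_model(uint32_t dst[8], const uint32_t a[8],
                                 const uint32_t b[8], int imm) {
      memcpy(dst,     a + 4 * (imm & 1),        4 * sizeof(uint32_t));
      memcpy(dst + 4, b + 4 * ((imm >> 1) & 1), 4 * sizeof(uint32_t));
    }

    int main(void) {
      uint32_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
      uint32_t b[8] = {10, 11, 12, 13, 14, 15, 16, 17};
      uint32_t d[8];
      shuffle_x4_model(d, a, b, 0x3);  /* high half of A, high half of B */
      for (int i = 0; i < 8; ++i) printf("%u ", d[i]); /* 4 5 6 7 14 15 16 17 */
      printf("\n");
      return 0;
    }
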
-#define _mm256_shuffle_i64x2(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
-                                              (__v4di)(__m256i)(B), \
-                                              (int)(imm), \
-                                              (__v4di)_mm256_setzero_si256(), \
-                                              (__mmask8)-1); })
+#define _mm256_shuffle_i64x2(A, B, imm) \
+  (__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \
+                                         (__v4di)(__m256i)(B), (int)(imm))
 
-#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
-                                              (__v4di)(__m256i)(B), \
-                                              (int)(imm), \
-                                              (__v4di)(__m256i)(W), \
-                                              (__mmask8)(U)); })
+#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                      (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
+                                      (__v4di)(__m256i)(W))
 
-#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
-                                              (__v4di)(__m256i)(B), \
-                                              (int)(imm), \
-                                              (__v4di)_mm256_setzero_si256(), \
-                                              (__mmask8)(U)); })
 
-#define _mm_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                      (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
+                                      (__v4di)_mm256_setzero_si256())
+
+#define _mm_mask_shuffle_pd(W, U, A, B, M) \
   (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
                                        (__v2df)_mm_shuffle_pd((A), (B), (M)), \
-                                       (__v2df)(__m128d)(W)); })
+                                       (__v2df)(__m128d)(W))
 
-#define _mm_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+#define _mm_maskz_shuffle_pd(U, A, B, M) \
   (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
                                        (__v2df)_mm_shuffle_pd((A), (B), (M)), \
-                                       (__v2df)_mm_setzero_pd()); })
+                                       (__v2df)_mm_setzero_pd())
 
-#define _mm256_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+#define _mm256_mask_shuffle_pd(W, U, A, B, M) \
   (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                        (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
-                                       (__v4df)(__m256d)(W)); })
+                                       (__v4df)(__m256d)(W))
 
-#define _mm256_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+#define _mm256_maskz_shuffle_pd(U, A, B, M) \
   (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                        (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
-                                       (__v4df)_mm256_setzero_pd()); })
+                                       (__v4df)_mm256_setzero_pd())
 
-#define _mm_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+#define _mm_mask_shuffle_ps(W, U, A, B, M) \
   (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
                                       (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
-                                      (__v4sf)(__m128)(W)); })
+                                      (__v4sf)(__m128)(W))
 
-#define _mm_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+#define _mm_maskz_shuffle_ps(U, A, B, M) \
   (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
                                       (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
-                                      (__v4sf)_mm_setzero_ps()); })
+                                      (__v4sf)_mm_setzero_ps())
 
-#define _mm256_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+#define _mm256_mask_shuffle_ps(W, U, A, B, M) \
   (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
                                       (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
-                                      (__v8sf)(__m256)(W)); })
+                                      (__v8sf)(__m256)(W))
 
-#define _mm256_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+#define _mm256_maskz_shuffle_ps(U, A, B, M) \
   (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
                                       (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
-                                      (__v8sf)_mm256_setzero_ps()); })
+                                      (__v8sf)_mm256_setzero_ps())
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_rsqrt14_pd (__m128d __A)
 {
   return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
@@ -7338,7 +6503,7 @@
                  (__mmask8) -1);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A)
 {
   return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
@@ -7346,7 +6511,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A)
 {
   return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
@@ -7355,7 +6520,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_rsqrt14_pd (__m256d __A)
 {
   return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
@@ -7364,7 +6529,7 @@
                  (__mmask8) -1);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A)
 {
   return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
@@ -7372,7 +6537,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A)
 {
   return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
@@ -7381,7 +6546,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_rsqrt14_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
@@ -7390,7 +6555,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A)
 {
   return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
@@ -7398,7 +6563,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A)
 {
   return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
@@ -7407,7 +6572,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_rsqrt14_ps (__m256 __A)
 {
   return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
@@ -7416,7 +6581,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A)
 {
   return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
@@ -7424,7 +6589,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
 {
   return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
@@ -7433,56 +6598,53 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_broadcast_f32x4 (__m128 __A)
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_broadcast_f32x4(__m128 __A)
 {
-  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
-                (__v8sf)_mm256_undefined_pd (),
-                (__mmask8) -1);
+  return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+                                         0, 1, 2, 3, 0, 1, 2, 3);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
 {
-  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
-                (__v8sf) __O,
-                __M);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+                                            (__v8sf)_mm256_broadcast_f32x4(__A),
+                                            (__v8sf)__O);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
 {
-  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
-                (__v8sf) _mm256_setzero_ps (),
-                __M);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+                                            (__v8sf)_mm256_broadcast_f32x4(__A),
+                                            (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_broadcast_i32x4 (__m128i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcast_i32x4(__m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
-                 (__v8si)_mm256_undefined_si256 (),
-                 (__mmask8) -1);
+  return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+                                          0, 1, 2, 3, 0, 1, 2, 3);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
-                 (__v8si)
-                 __O, __M);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                            (__v8si)_mm256_broadcast_i32x4(__A),
+                                            (__v8si)__O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
-                 __A,
-                 (__v8si) _mm256_setzero_si256 (),
-                 __M);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                            (__v8si)_mm256_broadcast_i32x4(__A),
+                                            (__v8si)_mm256_setzero_si256());
 }
 
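The shufflevector index list 0,1,2,3,0,1,2,3 just repeats the 128-bit source into both halves of the destination, which is why the masked forms can be rebuilt as the unmasked broadcast followed by a select. A scalar sketch:

    #include <stdio.h>

    /* Scalar model of _mm256_broadcast_f32x4: copy the four source floats
       into both 128-bit halves of the result. */
    static void broadcast_f32x4_model(float dst[8], const float src[4]) {
      for (int i = 0; i < 8; ++i)
        dst[i] = src[i % 4];   /* same as shuffle indices 0,1,2,3,0,1,2,3 */
    }

    int main(void) {
      float s[4] = {1.0f, 2.0f, 3.0f, 4.0f}, d[8];
      broadcast_f32x4_model(d, s);
      for (int i = 0; i < 8; ++i) printf("%g ", d[i]); /* 1 2 3 4 1 2 3 4 */
      printf("\n");
      return 0;
    }
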
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
 {
   return (__m256d)__builtin_ia32_selectpd_256(__M,
@@ -7490,7 +6652,7 @@
                                               (__v4df) __O);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
 {
   return (__m256d)__builtin_ia32_selectpd_256(__M,
@@ -7498,7 +6660,7 @@
                                               (__v4df) _mm256_setzero_pd());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
 {
   return (__m128)__builtin_ia32_selectps_128(__M,
@@ -7506,7 +6668,7 @@
                                              (__v4sf) __O);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
 {
   return (__m128)__builtin_ia32_selectps_128(__M,
@@ -7514,7 +6676,7 @@
                                              (__v4sf) _mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
 {
   return (__m256)__builtin_ia32_selectps_256(__M,
@@ -7522,7 +6684,7 @@
                                              (__v8sf) __O);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
 {
   return (__m256)__builtin_ia32_selectps_256(__M,
@@ -7530,7 +6692,7 @@
                                              (__v8sf) _mm256_setzero_ps());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectd_128(__M,
@@ -7538,7 +6700,7 @@
                                              (__v4si) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectd_128(__M,
@@ -7546,7 +6708,7 @@
                                              (__v4si) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectd_256(__M,
@@ -7554,7 +6716,7 @@
                                              (__v8si) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectd_256(__M,
@@ -7562,7 +6724,7 @@
                                              (__v8si) _mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectq_128(__M,
@@ -7570,7 +6732,7 @@
                                              (__v2di) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectq_128(__M,
@@ -7578,7 +6740,7 @@
                                              (__v2di) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectq_256(__M,
@@ -7586,7 +6748,7 @@
                                              (__v4di) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectq_256(__M,
@@ -7594,7 +6756,7 @@
                                              (__v4di) _mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtsepi32_epi8 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
@@ -7602,14 +6764,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
                (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
@@ -7617,13 +6779,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
 }
 
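The cvtsepi (VPMOVS*) conversions in this stretch saturate each element to the narrower signed range before truncating, and the destination bytes beyond the converted elements are zeroed; that is what distinguishes them from the plain truncating cvtepi conversions further down. A per-element scalar sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of one element of _mm_cvtsepi32_epi8 (VPMOVSDB): clamp
       the 32-bit value to [-128, 127], then truncate. */
    static int8_t sat_epi32_to_epi8(int32_t v) {
      if (v > INT8_MAX) return INT8_MAX;
      if (v < INT8_MIN) return INT8_MIN;
      return (int8_t)v;
    }

    int main(void) {
      int32_t in[4] = {1000, -1000, 42, -42};
      for (int i = 0; i < 4; ++i) printf("%d ", sat_epi32_to_epi8(in[i]));
      printf("\n"); /* 127 -128 42 -42 */
      return 0;
    }
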
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtsepi32_epi8 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
@@ -7631,14 +6793,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
                (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
@@ -7646,13 +6808,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtsepi32_epi16 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
@@ -7660,7 +6822,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
@@ -7668,7 +6830,7 @@
                __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
@@ -7676,13 +6838,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtsepi32_epi16 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
@@ -7690,14 +6852,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
                (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
@@ -7705,13 +6867,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtsepi64_epi8 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
@@ -7719,14 +6881,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
                (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
@@ -7734,13 +6896,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtsepi64_epi8 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
@@ -7748,14 +6910,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
                (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
@@ -7763,13 +6925,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtsepi64_epi32 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
@@ -7777,14 +6939,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
                (__v4si) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
@@ -7792,13 +6954,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtsepi64_epi32 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
@@ -7806,7 +6968,7 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
@@ -7814,7 +6976,7 @@
                __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
@@ -7822,13 +6984,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtsepi64_epi16 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
@@ -7836,14 +6998,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
                (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
@@ -7851,13 +7013,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtsepi64_epi16 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
@@ -7865,14 +7027,14 @@
                (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
                (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
@@ -7880,13 +7042,13 @@
                __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtusepi32_epi8 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
@@ -7894,7 +7056,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
@@ -7902,7 +7064,7 @@
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
@@ -7910,13 +7072,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtusepi32_epi8 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
@@ -7924,7 +7086,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
@@ -7932,7 +7094,7 @@
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
@@ -7940,13 +7102,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtusepi32_epi16 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
@@ -7954,14 +7116,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
                 (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
@@ -7969,13 +7131,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtusepi32_epi16 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
@@ -7983,14 +7145,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
                 (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
@@ -7998,13 +7160,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtusepi64_epi8 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
@@ -8012,7 +7174,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
@@ -8020,7 +7182,7 @@
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
@@ -8028,13 +7190,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtusepi64_epi8 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
@@ -8042,7 +7204,7 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
@@ -8050,7 +7212,7 @@
                 __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
@@ -8058,13 +7220,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtusepi64_epi32 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
@@ -8072,14 +7234,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
                 (__v4si) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
@@ -8087,13 +7249,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtusepi64_epi32 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
@@ -8101,14 +7263,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
                 (__v4si) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
@@ -8116,13 +7278,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtusepi64_epi16 (__m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
@@ -8130,14 +7292,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
                 (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
@@ -8145,13 +7307,13 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtusepi64_epi16 (__m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
@@ -8159,14 +7321,14 @@
                 (__mmask8) -1);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
                 (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
@@ -8174,28 +7336,28 @@
                 __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
 {
-  return __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+  __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtepi32_epi8 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
-              (__v16qi)_mm_undefined_si128(),
-              (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+      __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
+      2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
 }
 
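By contrast, the new unmasked _mm_cvtepi32_epi8 body above is a modular truncation: __builtin_convertvector keeps the low 8 bits of each dword, and the shuffle pads the remaining 12 bytes of the 128-bit result with zeros (index 7 of the combined vector is one of the zero elements). A scalar sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of _mm_cvtepi32_epi8 (VPMOVDB, no saturation): keep the
       low byte of each 32-bit element; the upper 12 result bytes are zero. */
    static void cvtepi32_epi8_model(int8_t dst[16], const int32_t src[4]) {
      for (int i = 0; i < 16; ++i)
        dst[i] = (i < 4) ? (int8_t)(uint8_t)src[i] : 0;  /* modular truncate */
    }

    int main(void) {
      int32_t src[4] = {0x101, -1, 300, 7};
      int8_t dst[16];
      cvtepi32_epi8_model(dst, src);
      for (int i = 0; i < 4; ++i) printf("%d ", dst[i]); /* 1 -1 44 7 */
      printf("\n");
      return 0;
    }
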
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
               (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
@@ -8204,28 +7366,29 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi32_epi8 (__m256i __A)
 {
-  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
-              (__v16qi)_mm_undefined_si128(),
-              (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+      __builtin_convertvector((__v8si)__A, __v8qi),
+      (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+      12, 13, 14, 15);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
               (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
@@ -8233,28 +7396,28 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtepi32_epi16 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
-              (__v8hi) _mm_setzero_si128 (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+      __builtin_convertvector((__v4si)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
+      2, 3, 4, 5, 6, 7);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
               (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
@@ -8262,28 +7425,26 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi32_epi16 (__m256i __A)
 {
-  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
-              (__v8hi)_mm_setzero_si128 (),
-              (__mmask8) -1);
+  return (__m128i)__builtin_convertvector((__v8si)__A, __v8hi);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
               (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
@@ -8291,28 +7452,28 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi32_storeu_epi16 (void *  __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtepi64_epi8 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
-              (__v16qi) _mm_undefined_si128(),
-              (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+      __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
               (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
@@ -8320,28 +7481,28 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi64_epi8 (__m256i __A)
 {
-  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
-              (__v16qi) _mm_undefined_si128(),
-              (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+      __builtin_convertvector((__v4di)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
+      2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
               (__v16qi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
@@ -8349,28 +7510,27 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtepi64_epi32 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
-              (__v4si)_mm_undefined_si128(),
-              (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+      __builtin_convertvector((__v2di)__A, __v2si), (__v2si){0, 0}, 0, 1, 2, 3);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
               (__v4si) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
@@ -8378,50 +7538,49 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi64_epi32 (__m256i __A)
 {
-  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
-              (__v4si) _mm_undefined_si128(),
-              (__mmask8) -1);
+  return (__m128i)__builtin_convertvector((__v4di)__A, __v4si);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
 {
-  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
-              (__v4si) __O, __M);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm256_cvtepi64_epi32(__A),
+                                             (__v4si)__O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
 {
-  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
-              (__v4si) _mm_setzero_si128 (),
-              __M);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm256_cvtepi64_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
 }
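+/* Pattern note (illustrative, not from upstream comments): the masked
+ * variants above are now composed from the unmasked intrinsic blended
+ * through __builtin_ia32_selectd_128, replacing the dedicated masked
+ * builtin. */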
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtepi64_epi16 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
-              (__v8hi) _mm_undefined_si128(),
-              (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+      __builtin_convertvector((__v2di)__A, __v2hi), (__v2hi){0, 0}, 0, 1, 2, 3,
+      3, 3, 3, 3);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
@@ -8429,7 +7588,7 @@
               __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A)
 {
   return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
@@ -8437,28 +7596,28 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
 {
   __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtepi64_epi16 (__m256i __A)
 {
-  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
-              (__v8hi)_mm_undefined_si128(),
-              (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+      __builtin_convertvector((__v4di)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
+      2, 3, 4, 5, 6, 7);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
               (__v8hi) __O, __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A)
 {
   return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
@@ -8466,472 +7625,410 @@
               __M);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
 {
   __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
 }
 
-#define _mm256_extractf32x4_ps(A, imm) __extension__ ({ \
+#define _mm256_extractf32x4_ps(A, imm) \
   (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
                                                (int)(imm), \
-                                               (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)-1); })
+                                               (__v4sf)_mm_undefined_ps(), \
+                                               (__mmask8)-1)
 
-#define _mm256_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \
+#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \
   (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
                                                (int)(imm), \
                                                (__v4sf)(__m128)(W), \
-                                               (__mmask8)(U)); })
+                                               (__mmask8)(U))
 
-#define _mm256_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \
+#define _mm256_maskz_extractf32x4_ps(U, A, imm) \
   (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
                                                (int)(imm), \
                                                (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)(U)); })
+                                               (__mmask8)(U))
 
-#define _mm256_extracti32x4_epi32(A, imm) __extension__ ({ \
+#define _mm256_extracti32x4_epi32(A, imm) \
   (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
                                                 (int)(imm), \
-                                                (__v4si)_mm_setzero_si128(), \
-                                                (__mmask8)-1); })
+                                                (__v4si)_mm_undefined_si128(), \
+                                                (__mmask8)-1)
 
-#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
+#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \
   (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
                                                 (int)(imm), \
                                                 (__v4si)(__m128i)(W), \
-                                                (__mmask8)(U)); })
+                                                (__mmask8)(U))
 
-#define _mm256_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
+#define _mm256_maskz_extracti32x4_epi32(U, A, imm) \
   (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
                                                 (int)(imm), \
                                                 (__v4si)_mm_setzero_si128(), \
-                                                (__mmask8)(U)); })
+                                                (__mmask8)(U))
 
-#define _mm256_insertf32x4(A, B, imm) __extension__ ({ \
-  (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \
-                                              (__v4sf)(__m128)(B), (int)(imm), \
-                                              (__v8sf)_mm256_setzero_ps(), \
-                                              (__mmask8)-1); })
+#define _mm256_insertf32x4(A, B, imm) \
+  (__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \
+                                         (__v4sf)(__m128)(B), (int)(imm))
 
-#define _mm256_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \
-                                              (__v4sf)(__m128)(B), (int)(imm), \
-                                              (__v8sf)(__m256)(W), \
-                                              (__mmask8)(U)); })
+#define _mm256_mask_insertf32x4(W, U, A, B, imm) \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                  (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+                                  (__v8sf)(__m256)(W))
 
-#define _mm256_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
-  (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \
-                                              (__v4sf)(__m128)(B), (int)(imm), \
-                                              (__v8sf)_mm256_setzero_ps(), \
-                                              (__mmask8)(U)); })
+#define _mm256_maskz_insertf32x4(U, A, B, imm) \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                  (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+                                  (__v8sf)_mm256_setzero_ps())
 
-#define _mm256_inserti32x4(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \
-                                               (__v4si)(__m128i)(B), \
-                                               (int)(imm), \
-                                               (__v8si)_mm256_setzero_si256(), \
-                                               (__mmask8)-1); })
+#define _mm256_inserti32x4(A, B, imm) \
+  (__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \
+                                          (__v4si)(__m128i)(B), (int)(imm))
 
-#define _mm256_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \
-                                               (__v4si)(__m128i)(B), \
-                                               (int)(imm), \
-                                               (__v8si)(__m256i)(W), \
-                                               (__mmask8)(U)); })
+#define _mm256_mask_inserti32x4(W, U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                  (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+                                  (__v8si)(__m256i)(W))
 
-#define _mm256_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \
-                                               (__v4si)(__m128i)(B), \
-                                               (int)(imm), \
-                                               (__v8si)_mm256_setzero_si256(), \
-                                               (__mmask8)(U)); })
+#define _mm256_maskz_inserti32x4(U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                  (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+                                  (__v8si)_mm256_setzero_si256())
 
-#define _mm_getmant_pd(A, B, C) __extension__({\
+#define _mm_getmant_pd(A, B, C) \
   (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm_mask_getmant_pd(W, U, A, B, C) __extension__({\
+#define _mm_mask_getmant_pd(W, U, A, B, C) \
   (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v2df)(__m128d)(W), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm_maskz_getmant_pd(U, A, B, C) __extension__({\
+#define _mm_maskz_getmant_pd(U, A, B, C) \
   (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm256_getmant_pd(A, B, C) __extension__ ({ \
+#define _mm256_getmant_pd(A, B, C) \
   (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v4df)_mm256_setzero_pd(), \
-                                            (__mmask8)-1); })
+                                            (__mmask8)-1)
 
-#define _mm256_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
+#define _mm256_mask_getmant_pd(W, U, A, B, C) \
   (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v4df)(__m256d)(W), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm256_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
+#define _mm256_maskz_getmant_pd(U, A, B, C) \
   (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v4df)_mm256_setzero_pd(), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm_getmant_ps(A, B, C) __extension__ ({ \
+#define _mm_getmant_ps(A, B, C) \
   (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1); })
+                                           (__mmask8)-1)
 
-#define _mm_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+#define _mm_mask_getmant_ps(W, U, A, B, C) \
   (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v4sf)(__m128)(W), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+#define _mm_maskz_getmant_ps(U, A, B, C) \
   (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm256_getmant_ps(A, B, C) __extension__ ({ \
+#define _mm256_getmant_ps(A, B, C) \
   (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)-1); })
+                                           (__mmask8)-1)
 
-#define _mm256_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+#define _mm256_mask_getmant_ps(W, U, A, B, C) \
   (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v8sf)(__m256)(W), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm256_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+#define _mm256_maskz_getmant_ps(U, A, B, C) \
   (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)(U)); })
+                                           (__mmask8)(U))
 
-#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \
   (__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \
                                         (double const *)(addr), \
                                         (__v2di)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \
   (__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \
                                         (long long const *)(addr), \
                                         (__v2di)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \
   (__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \
                                         (double const *)(addr), \
                                         (__v4di)(__m256i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \
   (__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \
                                         (long long const *)(addr), \
                                         (__v4di)(__m256i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \
   (__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \
                                        (float const *)(addr), \
                                        (__v2di)(__m128i)(index), \
-                                       (__mmask8)(mask), (int)(scale)); })
+                                       (__mmask8)(mask), (int)(scale))
 
-#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \
   (__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \
                                         (int const *)(addr), \
                                         (__v2di)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \
   (__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \
                                        (float const *)(addr), \
                                        (__v4di)(__m256i)(index), \
-                                       (__mmask8)(mask), (int)(scale)); })
+                                       (__mmask8)(mask), (int)(scale))
 
-#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \
   (__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \
                                         (int const *)(addr), \
                                         (__v4di)(__m256i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \
   (__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \
                                         (double const *)(addr), \
                                         (__v4si)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \
   (__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \
                                         (long long const *)(addr), \
                                         (__v4si)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \
   (__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \
                                         (double const *)(addr), \
                                         (__v4si)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \
   (__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \
                                         (long long const *)(addr), \
                                         (__v4si)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \
   (__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \
                                        (float const *)(addr), \
                                        (__v4si)(__m128i)(index), \
-                                       (__mmask8)(mask), (int)(scale)); })
+                                       (__mmask8)(mask), (int)(scale))
 
-#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \
   (__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \
                                         (int const *)(addr), \
                                         (__v4si)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \
   (__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \
                                        (float const *)(addr), \
                                        (__v8si)(__m256i)(index), \
-                                       (__mmask8)(mask), (int)(scale)); })
+                                       (__mmask8)(mask), (int)(scale))
 
-#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \
   (__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \
                                         (int const *)(addr), \
                                         (__v8si)(__m256i)(index), \
-                                        (__mmask8)(mask), (int)(scale)); })
+                                        (__mmask8)(mask), (int)(scale))
 
-#define _mm256_permutex_pd(X, C) __extension__ ({ \
-  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(X), \
-                                   (__v4df)_mm256_undefined_pd(), \
-                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
-                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+#define _mm256_permutex_pd(X, C) \
+  (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C))
 
-#define _mm256_mask_permutex_pd(W, U, X, C) __extension__ ({ \
+#define _mm256_mask_permutex_pd(W, U, X, C) \
   (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                        (__v4df)_mm256_permutex_pd((X), (C)), \
-                                       (__v4df)(__m256d)(W)); })
+                                       (__v4df)(__m256d)(W))
 
-#define _mm256_maskz_permutex_pd(U, X, C) __extension__ ({ \
+#define _mm256_maskz_permutex_pd(U, X, C) \
   (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                        (__v4df)_mm256_permutex_pd((X), (C)), \
-                                       (__v4df)_mm256_setzero_pd()); })
+                                       (__v4df)_mm256_setzero_pd())
 
-#define _mm256_permutex_epi64(X, C) __extension__ ({ \
-  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(X), \
-                                   (__v4di)_mm256_undefined_si256(), \
-                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
-                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+#define _mm256_permutex_epi64(X, C) \
+  (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C))
 
-#define _mm256_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
+#define _mm256_mask_permutex_epi64(W, U, X, C) \
   (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                       (__v4di)_mm256_permutex_epi64((X), (C)), \
-                                      (__v4di)(__m256i)(W)); })
+                                      (__v4di)(__m256i)(W))
 
-#define _mm256_maskz_permutex_epi64(U, X, C) __extension__ ({ \
+#define _mm256_maskz_permutex_epi64(U, X, C) \
   (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                       (__v4di)_mm256_permutex_epi64((X), (C)), \
-                                      (__v4di)_mm256_setzero_si256()); })
+                                      (__v4di)_mm256_setzero_si256())
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_permutexvar_pd (__m256i __X, __m256d __Y)
 {
-  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
-                 (__v4di) __X,
-                 (__v4df) _mm256_undefined_si256 (),
-                 (__mmask8) -1);
+  return (__m256d)__builtin_ia32_permvardf256((__v4df)__Y, (__v4di)__X);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
           __m256d __Y)
 {
-  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
-                 (__v4di) __X,
-                 (__v4df) __W,
-                 (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                        (__v4df)_mm256_permutexvar_pd(__X, __Y),
+                                        (__v4df)__W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
 {
-  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
-                 (__v4di) __X,
-                 (__v4df) _mm256_setzero_pd (),
-                 (__mmask8) __U);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                        (__v4df)_mm256_permutexvar_pd(__X, __Y),
+                                        (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
-                 (__v4di) __X,
-                 (__v4di) _mm256_setzero_si256 (),
-                 (__mmask8) __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
-                 (__v4di) __X,
-                 (__v4di) _mm256_undefined_si256 (),
-                 (__mmask8) -1);
+  return (__m256i)__builtin_ia32_permvardi256((__v4di) __Y, (__v4di) __X);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                     (__v4di)_mm256_permutexvar_epi64(__X, __Y),
+                                     (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
              __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
-                 (__v4di) __X,
-                 (__v4di) __W,
-                 __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                     (__v4di)_mm256_permutexvar_epi64(__X, __Y),
+                                     (__v4di)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
-          __m256 __Y)
+#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A))
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y)
 {
-  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
-                (__v8si) __X,
-                (__v8sf) __W,
-                (__mmask8) __U);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                        (__v8sf)_mm256_permutexvar_ps(__X, __Y),
+                                        (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y)
 {
-  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
-                (__v8si) __X,
-                (__v8sf) _mm256_setzero_ps (),
-                (__mmask8) __U);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                        (__v8sf)_mm256_permutexvar_ps(__X, __Y),
+                                        (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_permutexvar_ps (__m256i __X, __m256 __Y)
+#define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X,
+                              __m256i __Y)
 {
-  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
-                (__v8si) __X,
-                (__v8sf) _mm256_undefined_si256 (),
-                (__mmask8) -1);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                     (__v8si)_mm256_permutexvar_epi32(__X, __Y),
+                                     (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
-                 (__v8si) __X,
-                 (__v8si) _mm256_setzero_si256 (),
-                 __M);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                     (__v8si)_mm256_permutexvar_epi32(__X, __Y),
+                                     (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
-             __m256i __Y)
-{
-  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
-                 (__v8si) __X,
-                 (__v8si) __W,
-                 (__mmask8) __M);
-}
+#define _mm_alignr_epi32(A, B, imm) \
+  (__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \
+                                    (__v4si)(__m128i)(B), (int)(imm))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
-{
-  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
-                 (__v8si) __X,
-                 (__v8si) _mm256_undefined_si256(),
-                 (__mmask8) -1);
-}
+#define _mm_mask_alignr_epi32(W, U, A, B, imm) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                    (__v4si)(__m128i)(W))
 
-#define _mm_alignr_epi32(A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
-                                         (__v4si)(__m128i)(B), (int)(imm), \
-                                         (__v4si)_mm_undefined_si128(), \
-                                         (__mmask8)-1); })
+#define _mm_maskz_alignr_epi32(U, A, B, imm) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                    (__v4si)_mm_setzero_si128())
 
-#define _mm_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
-                                         (__v4si)(__m128i)(B), (int)(imm), \
-                                         (__v4si)(__m128i)(W), \
-                                         (__mmask8)(U)); })
+#define _mm256_alignr_epi32(A, B, imm) \
+  (__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \
+                                    (__v8si)(__m256i)(B), (int)(imm))
 
-#define _mm_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
-                                         (__v4si)(__m128i)(B), (int)(imm), \
-                                         (__v4si)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
+#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+                                 (__v8si)(__m256i)(W))
 
-#define _mm256_alignr_epi32(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
-                                         (__v8si)(__m256i)(B), (int)(imm), \
-                                         (__v8si)_mm256_undefined_si256(), \
-                                         (__mmask8)-1); })
+#define _mm256_maskz_alignr_epi32(U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+                                 (__v8si)_mm256_setzero_si256())
 
-#define _mm256_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
-                                         (__v8si)(__m256i)(B), (int)(imm), \
-                                         (__v8si)(__m256i)(W), \
-                                         (__mmask8)(U)); })
+#define _mm_alignr_epi64(A, B, imm) \
+  (__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \
+                                    (__v2di)(__m128i)(B), (int)(imm))
 
-#define _mm256_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
-                                         (__v8si)(__m256i)(B), (int)(imm), \
-                                         (__v8si)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
+#define _mm_mask_alignr_epi64(W, U, A, B, imm) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+                                    (__v2di)(__m128i)(W))
 
-#define _mm_alignr_epi64(A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
-                                         (__v2di)(__m128i)(B), (int)(imm), \
-                                         (__v2di)_mm_setzero_di(), \
-                                         (__mmask8)-1); })
+#define _mm_maskz_alignr_epi64(U, A, B, imm) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+                                    (__v2di)_mm_setzero_si128())
 
-#define _mm_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
-                                         (__v2di)(__m128i)(B), (int)(imm), \
-                                         (__v2di)(__m128i)(W), \
-                                         (__mmask8)(U)); })
+#define _mm256_alignr_epi64(A, B, imm) \
+  (__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \
+                                    (__v4di)(__m256i)(B), (int)(imm))
 
-#define _mm_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
-                                         (__v2di)(__m128i)(B), (int)(imm), \
-                                         (__v2di)_mm_setzero_di(), \
-                                         (__mmask8)(U)); })
+#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+                                 (__v4di)(__m256i)(W))
 
-#define _mm256_alignr_epi64(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
-                                         (__v4di)(__m256i)(B), (int)(imm), \
-                                         (__v4di)_mm256_undefined_pd(), \
-                                         (__mmask8)-1); })
+#define _mm256_maskz_alignr_epi64(U, A, B, imm) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+                                 (__v4di)_mm256_setzero_si256())
 
-#define _mm256_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
-                                         (__v4di)(__m256i)(B), (int)(imm), \
-                                         (__v4di)(__m256i)(W), \
-                                         (__mmask8)(U)); })
-
-#define _mm256_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
-                                         (__v4di)(__m256i)(B), (int)(imm), \
-                                         (__v4di)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
 {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
@@ -8939,7 +8036,7 @@
                                              (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
 {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
@@ -8947,7 +8044,7 @@
                                              (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
 {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
@@ -8955,7 +8052,7 @@
                                              (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
 {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
@@ -8963,7 +8060,7 @@
                                              (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
 {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
@@ -8971,7 +8068,7 @@
                                              (__v4sf)__W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
 {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
@@ -8979,7 +8076,7 @@
                                              (__v4sf)_mm_setzero_ps());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
 {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
@@ -8987,7 +8084,7 @@
                                              (__v8sf)__W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
 {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
@@ -8995,27 +8092,27 @@
                                              (__v8sf)_mm256_setzero_ps());
 }
 
-#define _mm256_mask_shuffle_epi32(W, U, A, I) __extension__({\
+#define _mm256_mask_shuffle_epi32(W, U, A, I) \
   (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
                                       (__v8si)_mm256_shuffle_epi32((A), (I)), \
-                                      (__v8si)(__m256i)(W)); })
+                                      (__v8si)(__m256i)(W))
 
-#define _mm256_maskz_shuffle_epi32(U, A, I) __extension__({\
+#define _mm256_maskz_shuffle_epi32(U, A, I) \
   (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
                                       (__v8si)_mm256_shuffle_epi32((A), (I)), \
-                                      (__v8si)_mm256_setzero_si256()); })
+                                      (__v8si)_mm256_setzero_si256())
 
-#define _mm_mask_shuffle_epi32(W, U, A, I) __extension__({\
+#define _mm_mask_shuffle_epi32(W, U, A, I) \
   (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
                                       (__v4si)_mm_shuffle_epi32((A), (I)), \
-                                      (__v4si)(__m128i)(W)); })
+                                      (__v4si)(__m128i)(W))
 
-#define _mm_maskz_shuffle_epi32(U, A, I) __extension__({\
+#define _mm_maskz_shuffle_epi32(U, A, I) \
   (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
                                       (__v4si)_mm_shuffle_epi32((A), (I)), \
-                                      (__v4si)_mm_setzero_si128()); })
+                                      (__v4si)_mm_setzero_si128())
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)
 {
   return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
@@ -9023,7 +8120,7 @@
               (__v2df) __W);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_mov_pd (__mmask8 __U, __m128d __A)
 {
   return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
@@ -9031,7 +8128,7 @@
               (__v2df) _mm_setzero_pd ());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
 {
   return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
@@ -9039,7 +8136,7 @@
               (__v4df) __W);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
 {
   return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
@@ -9047,7 +8144,7 @@
               (__v4df) _mm256_setzero_pd ());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A)
 {
   return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
@@ -9055,7 +8152,7 @@
              (__v4sf) __W);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_mov_ps (__mmask8 __U, __m128 __A)
 {
   return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
@@ -9063,7 +8160,7 @@
              (__v4sf) _mm_setzero_ps ());
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A)
 {
   return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
@@ -9071,7 +8168,7 @@
              (__v8sf) __W);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_mov_ps (__mmask8 __U, __m256 __A)
 {
   return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
@@ -9079,7 +8176,7 @@
              (__v8sf) _mm256_setzero_ps ());
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
 {
   return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
@@ -9087,7 +8184,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
 {
   return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
@@ -9096,7 +8193,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
 {
   return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
@@ -9104,7 +8201,7 @@
                 (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
 {
   return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
@@ -9113,7 +8210,7 @@
                 (__mmask8) __U);
 }
 
-static __inline __m128i __DEFAULT_FN_ATTRS
+static __inline __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A)
 {
   return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION,
@@ -9121,7 +8218,7 @@
                                                   (__mmask8) __U);
 }
 
-static __inline __m128i __DEFAULT_FN_ATTRS
+static __inline __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A)
 {
   return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION,
@@ -9129,17 +8226,17 @@
                                                   (__mmask8) __U);
 }
 
-#define _mm_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
+#define _mm_mask_cvt_roundps_ph(W, U, A, I) \
   (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
                                          (__v8hi)(__m128i)(W), \
-                                         (__mmask8)(U)); })
+                                         (__mmask8)(U))
 
-#define _mm_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
+#define _mm_maskz_cvt_roundps_ph(U, A, I) \
   (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
                                          (__v8hi)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
+                                         (__mmask8)(U))
 
-static __inline __m128i __DEFAULT_FN_ATTRS
+static __inline __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A)
 {
   return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION,
@@ -9147,24 +8244,25 @@
                                                       (__mmask8) __U);
 }
 
-static __inline __m128i __DEFAULT_FN_ATTRS
+static __inline __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A)
 {
   return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION,
                                                       (__v8hi) _mm_setzero_si128(),
                                                       (__mmask8) __U);
 }
-#define _mm256_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
+#define _mm256_mask_cvt_roundps_ph(W, U, A, I) \
   (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
                                             (__v8hi)(__m128i)(W), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
-#define _mm256_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
+#define _mm256_maskz_cvt_roundps_ph(U, A, I) \
   (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
                                             (__v8hi)_mm_setzero_si128(), \
-                                            (__mmask8)(U)); })
+                                            (__mmask8)(U))
 
 
-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
 
 #endif /* __AVX512VLINTRIN_H */
diff --git a/darwin-x86/clang-headers/avx512vlvbmi2intrin.h b/darwin-x86/clang-headers/avx512vlvbmi2intrin.h
new file mode 100644
index 0000000..baaf565
--- /dev/null
+++ b/darwin-x86/clang-headers/avx512vlvbmi2intrin.h
@@ -0,0 +1,751 @@
+/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLVBMI2INTRIN_H
+#define __AVX512VLVBMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256)))
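+/* __min_vector_width__ presumably marks the widest vector type each wrapper
+ * touches, so that always_inline expansion stays valid under narrower
+ * prefer-vector-width settings. */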
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
+              (__v8hi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
+              (__v8hi) _mm_setzero_si128(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
+              (__v16qi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
+              (__v16qi) _mm_setzero_si128(),
+              __U);
+}
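+
+/* A sketch of the compress semantics above: the elements of __D whose bits
+ * are set in __U are packed into the low lanes of the result. For example,
+ * with __U = 0x0A, _mm_maskz_compress_epi16 gives
+ * { __D[1], __D[3], 0, 0, 0, 0, 0, 0 }; the mask form takes the remaining
+ * lanes from __S instead of zero. */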
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
+{
+  __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
+{
+  __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
+              (__v8hi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
+              (__v8hi) _mm_setzero_si128(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
+              (__v16qi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
+              (__v16qi) _mm_setzero_si128(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
+              (__v8hi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
+              (__v8hi) _mm_setzero_si128(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
+              (__v16qi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
+              (__v16qi) _mm_setzero_si128(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
+              (__v16hi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
+              (__v16hi) _mm256_setzero_si256(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
+              (__v32qi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
+              (__v32qi) _mm256_setzero_si256(),
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
+{
+  __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
+{
+  __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
+              (__v16hi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
+              (__v16hi) _mm256_setzero_si256(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
+              (__v32qi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
+              (__v32qi) _mm256_setzero_si256(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
+              (__v16hi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
+              (__v16hi) _mm256_setzero_si256(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
+              (__v32qi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
+              (__v32qi) _mm256_setzero_si256(),
+              __U);
+}
+
+#define _mm256_shldi_epi64(A, B, I) \
+  (__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
+                                     (__v4di)(__m256i)(B), (int)(I))
+
+#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                    (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
+                                    (__v4di)(__m256i)(S))
+
+#define _mm256_maskz_shldi_epi64(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                    (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
+                                    (__v4di)_mm256_setzero_si256())
+
+#define _mm_shldi_epi64(A, B, I) \
+  (__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
+                                     (__v2di)(__m128i)(B), (int)(I))
+
+#define _mm_mask_shldi_epi64(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                      (__v2di)_mm_shldi_epi64((A), (B), (I)), \
+                                      (__v2di)(__m128i)(S))
+
+#define _mm_maskz_shldi_epi64(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                      (__v2di)_mm_shldi_epi64((A), (B), (I)), \
+                                      (__v2di)_mm_setzero_si128())
+
+#define _mm256_shldi_epi32(A, B, I) \
+  (__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
+                                     (__v8si)(__m256i)(B), (int)(I))
+
+#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                    (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
+                                    (__v8si)(__m256i)(S))
+
+#define _mm256_maskz_shldi_epi32(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                    (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
+                                    (__v8si)_mm256_setzero_si256())
+
+#define _mm_shldi_epi32(A, B, I) \
+  (__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
+                                     (__v4si)(__m128i)(B), (int)(I))
+
+#define _mm_mask_shldi_epi32(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shldi_epi32((A), (B), (I)), \
+                                      (__v4si)(__m128i)(S))
+
+#define _mm_maskz_shldi_epi32(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shldi_epi32((A), (B), (I)), \
+                                      (__v4si)_mm_setzero_si128())
+
+#define _mm256_shldi_epi16(A, B, I) \
+  (__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
+                                     (__v16hi)(__m256i)(B), (int)(I))
+
+#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                   (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
+                                   (__v16hi)(__m256i)(S))
+
+#define _mm256_maskz_shldi_epi16(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                   (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
+                                   (__v16hi)_mm256_setzero_si256())
+
+#define _mm_shldi_epi16(A, B, I) \
+  (__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
+                                     (__v8hi)(__m128i)(B), (int)(I))
+
+#define _mm_mask_shldi_epi16(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
+                                      (__v8hi)(__m128i)(S))
+
+#define _mm_maskz_shldi_epi16(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
+                                      (__v8hi)_mm_setzero_si128())
+
+#define _mm256_shrdi_epi64(A, B, I) \
+  (__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
+                                     (__v4di)(__m256i)(B), (int)(I))
+
+#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                    (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
+                                    (__v4di)(__m256i)(S))
+
+#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                    (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
+                                    (__v4di)_mm256_setzero_si256())
+
+#define _mm_shrdi_epi64(A, B, I) \
+  (__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
+                                     (__v2di)(__m128i)(B), (int)(I))
+
+#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                      (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
+                                      (__v2di)(__m128i)(S))
+
+#define _mm_maskz_shrdi_epi64(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                      (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
+                                      (__v2di)_mm_setzero_si128())
+
+#define _mm256_shrdi_epi32(A, B, I) \
+  (__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
+                                     (__v8si)(__m256i)(B), (int)(I))
+
+#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                    (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
+                                    (__v8si)(__m256i)(S))
+
+#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                    (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
+                                    (__v8si)_mm256_setzero_si256())
+
+#define _mm_shrdi_epi32(A, B, I) \
+  (__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
+                                     (__v4si)(__m128i)(B), (int)(I))
+
+#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
+                                      (__v4si)(__m128i)(S))
+
+#define _mm_maskz_shrdi_epi32(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
+                                      (__v4si)_mm_setzero_si128())
+
+#define _mm256_shrdi_epi16(A, B, I) \
+  (__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
+                                     (__v16hi)(__m256i)(B), (int)(I))
+
+#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                   (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
+                                   (__v16hi)(__m256i)(S))
+
+#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                   (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
+                                   (__v16hi)_mm256_setzero_si256())
+
+#define _mm_shrdi_epi16(A, B, I) \
+  (__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
+                                     (__v8hi)(__m128i)(B), (int)(I))
+
+#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
+                                      (__v8hi)(__m128i)(S))
+
+#define _mm_maskz_shrdi_epi16(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
+                                      (__v8hi)_mm_setzero_si128())
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              (__mmask8) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
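
An illustrative use of the new VBMI2 compress intrinsics defined above (a sketch, not part of the diff; assumes a toolchain and CPU supporting AVX512VL+AVX512VBMI2, e.g. compiled with -mavx512vl -mavx512vbmi2):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m128i v = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
  short out[8];
  /* Keep lanes 0, 2, 4 and 6 (mask 0x55): compress packs them toward the
   * low end, and the maskz form zeroes the remaining lanes. */
  _mm_storeu_si128((__m128i *)out, _mm_maskz_compress_epi16(0x55, v));
  for (int i = 0; i < 8; ++i)
    printf("%d ", out[i]);              /* 10 12 14 16 0 0 0 0 */
  printf("\n");
  return 0;
}
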
diff --git a/darwin-x86/clang-headers/avx512vlvnniintrin.h b/darwin-x86/clang-headers/avx512vlvnniintrin.h
new file mode 100644
index 0000000..6238226
--- /dev/null
+++ b/darwin-x86/clang-headers/avx512vlvnniintrin.h
@@ -0,0 +1,223 @@
+/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLVNNIINTRIN_H
+#define __AVX512VLVNNIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
+                                     (__v8si)__S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
+                                     (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
+                                              (__v8si)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                    (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
+                                    (__v8si)__S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
+                                     (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
+                                     (__v8si)__S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
+                                     (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
+                                              (__v8si)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                    (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
+                                    (__v8si)__S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                    (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
+                                    (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                        (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
+                                        (__v4si)__S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                        (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
+                                        (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                       (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
+                                       (__v4si)__S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                       (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
+                                       (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                        (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
+                                        (__v4si)__S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                        (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
+                                        (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                       (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
+                                       (__v4si)__S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                       (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
+                                       (__v4si)_mm_setzero_si128());
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
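
A hedged sketch of the VNNI dot-product semantics using the 128-bit form defined above (not part of the diff; assumes -mavx512vl -mavx512vnni and matching hardware):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m128i acc = _mm_setzero_si128();
  __m128i a = _mm_set1_epi8(2);         /* consumed as unsigned bytes */
  __m128i b = _mm_set1_epi8(-3);        /* consumed as signed bytes */
  int out[4];
  /* Each 32-bit lane accumulates four byte products: 4 * (2 * -3) = -24. */
  _mm_storeu_si128((__m128i *)out, _mm_dpbusd_epi32(acc, a, b));
  printf("%d\n", out[0]);               /* -24 */
  return 0;
}
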
diff --git a/darwin-x86/clang-headers/avx512vnniintrin.h b/darwin-x86/clang-headers/avx512vnniintrin.h
new file mode 100644
index 0000000..620ef5a
--- /dev/null
+++ b/darwin-x86/clang-headers/avx512vnniintrin.h
@@ -0,0 +1,129 @@
+/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VNNIINTRIN_H
+#define __AVX512VNNIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512)))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
+                                             (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                    (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
+                                    (__v16si)__S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                    (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
+                                    (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
+                                              (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                   (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
+                                   (__v16si)__S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                   (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
+                                   (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
+                                             (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                    (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
+                                    (__v16si)__S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                    (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
+                                    (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
+                                              (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                   (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
+                                   (__v16si)__S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                   (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
+                                   (__v16si)_mm512_setzero_si512());
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
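
For reference, a hypothetical scalar model of one 32-bit lane of _mm512_dpwssd_epi32 (illustrative only, not the header's implementation):

#include <stdint.h>

/* One lane of VPDPWSSD: the accumulator gains the sum of two products of
 * signed 16-bit pairs, evaluated in 32-bit precision. */
static int32_t dpwssd_lane(int32_t s, const int16_t a[2], const int16_t b[2])
{
  return s + (int32_t)a[0] * (int32_t)b[0] + (int32_t)a[1] * (int32_t)b[1];
}
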
diff --git a/darwin-x86/clang-headers/avx512vpopcntdqintrin.h b/darwin-x86/clang-headers/avx512vpopcntdqintrin.h
new file mode 100644
index 0000000..c99f594
--- /dev/null
+++ b/darwin-x86/clang-headers/avx512vpopcntdqintrin.h
@@ -0,0 +1,68 @@
+/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics ------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VPOPCNTDQINTRIN_H
+#define __AVX512VPOPCNTDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectq_512(
+      (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
+  return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
+  return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
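
An illustrative call of the new 512-bit population-count intrinsics (a sketch, not part of the diff; assumes -mavx512vpopcntdq and matching hardware):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m512i v = _mm512_set1_epi64(0x00FF00FF00FF00FFLL);
  long long out[8];
  _mm512_storeu_si512(out, _mm512_popcnt_epi64(v));
  printf("%lld\n", out[0]);             /* 32 set bits in every lane */
  return 0;
}
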
diff --git a/darwin-x86/clang-headers/avx512vpopcntdqvlintrin.h b/darwin-x86/clang-headers/avx512vpopcntdqvlintrin.h
new file mode 100644
index 0000000..681a75f
--- /dev/null
+++ b/darwin-x86/clang-headers/avx512vpopcntdqvlintrin.h
@@ -0,0 +1,105 @@
+/*===---- avx512vpopcntdqvlintrin.h - AVX512VPOPCNTDQ intrinsics -----------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VPOPCNTDQVLINTRIN_H
+#define __AVX512VPOPCNTDQVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256)))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi64(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectq_128(
+      (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
+  return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi32(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
+  return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi64(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectq_256(
+      (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
+  return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi32(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
+  return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
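
A short sketch of the merge-masked 128-bit variant defined above (not part of the diff; assumes -mavx512vpopcntdq -mavx512vl and matching hardware):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m128i src = _mm_set1_epi32(-1);
  __m128i v = _mm_set1_epi32(7);        /* three set bits per lane */
  int out[4];
  /* Mask 0x5 selects lanes 0 and 2; lanes 1 and 3 keep src. */
  _mm_storeu_si128((__m128i *)out, _mm_mask_popcnt_epi32(src, 0x5, v));
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 3 -1 3 -1 */
  return 0;
}
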
diff --git a/darwin-x86/clang-headers/avxintrin.h b/darwin-x86/clang-headers/avxintrin.h
index 86bfdfb..cb15396 100644
--- a/darwin-x86/clang-headers/avxintrin.h
+++ b/darwin-x86/clang-headers/avxintrin.h
@@ -50,14 +50,15 @@
 typedef long long __m256i __attribute__((__vector_size__(32)));
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
 
 /* Arithmetic */
-/// \brief Adds two 256-bit vectors of [4 x double].
+/// Adds two 256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
+/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
@@ -71,11 +72,11 @@
   return (__m256d)((__v4df)__a+(__v4df)__b);
 }
 
-/// \brief Adds two 256-bit vectors of [8 x float].
+/// Adds two 256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
+/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
@@ -89,11 +90,11 @@
   return (__m256)((__v8sf)__a+(__v8sf)__b);
 }
 
-/// \brief Subtracts two 256-bit vectors of [4 x double].
+/// Subtracts two 256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
+/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing the minuend.
@@ -107,11 +108,11 @@
   return (__m256d)((__v4df)__a-(__v4df)__b);
 }
 
-/// \brief Subtracts two 256-bit vectors of [8 x float].
+/// Subtracts two 256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
+/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing the minuend.
@@ -125,12 +126,12 @@
   return (__m256)((__v8sf)__a-(__v8sf)__b);
 }
 
-/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+/// Subtracts the even-indexed values and adds the odd-indexed values of
 ///    two 256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing the left source operand.
@@ -144,12 +145,12 @@
   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
 }
 
-/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+/// Subtracts the even-indexed values and adds the odd-indexed values of
 ///    two 256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing the left source operand.
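
A quick illustrative check of the addsub lane behavior documented above, where 0-based even lanes subtract and odd lanes add (not part of the diff; assumes -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
  __m256d b = _mm256_set1_pd(10.0);
  double out[4];
  _mm256_storeu_pd(out, _mm256_addsub_pd(a, b));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* -9 12 -7 14 */
  return 0;
}
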
@@ -163,11 +164,11 @@
   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
 }
 
-/// \brief Divides two 256-bit vectors of [4 x double].
+/// Divides two 256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
+/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing the dividend.
@@ -181,11 +182,11 @@
   return (__m256d)((__v4df)__a/(__v4df)__b);
 }
 
-/// \brief Divides two 256-bit vectors of [8 x float].
+/// Divides two 256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
+/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing the dividend.
@@ -199,12 +200,12 @@
   return (__m256)((__v8sf)__a/(__v8sf)__b);
 }
 
-/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
+/// Compares two 256-bit vectors of [4 x double] and returns the greater
 ///    of each pair of values.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
+/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing one of the operands.
@@ -218,12 +219,12 @@
   return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
 }
 
-/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
+/// Compares two 256-bit vectors of [8 x float] and returns the greater
 ///    of each pair of values.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
+/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing one of the operands.
@@ -237,12 +238,12 @@
   return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
 }
 
-/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
+/// Compares two 256-bit vectors of [4 x double] and returns the lesser
 ///    of each pair of values.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
+/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing one of the operands.
@@ -256,12 +257,12 @@
   return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
 }
 
-/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
+/// Compares two 256-bit vectors of [8 x float] and returns the lesser
 ///    of each pair of values.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
+/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing one of the operands.
@@ -275,11 +276,11 @@
   return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
 }
 
-/// \brief Multiplies two 256-bit vectors of [4 x double].
+/// Multiplies two 256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
+/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing one of the operands.
@@ -293,11 +294,11 @@
   return (__m256d)((__v4df)__a * (__v4df)__b);
 }
 
-/// \brief Multiplies two 256-bit vectors of [8 x float].
+/// Multiplies two 256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
+/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing one of the operands.
@@ -311,12 +312,12 @@
   return (__m256)((__v8sf)__a * (__v8sf)__b);
 }
 
-/// \brief Calculates the square roots of the values in a 256-bit vector of
+/// Calculates the square roots of the values in a 256-bit vector of
 ///    [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
+/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double].
@@ -328,12 +329,12 @@
   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
 }
 
-/// \brief Calculates the square roots of the values in a 256-bit vector of
+/// Calculates the square roots of the values in a 256-bit vector of
 ///    [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
+/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float].
@@ -345,12 +346,12 @@
   return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
 }
 
-/// \brief Calculates the reciprocal square roots of the values in a 256-bit
+/// Calculates the reciprocal square roots of the values in a 256-bit
 ///    vector of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
+/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float].
@@ -362,12 +363,12 @@
   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
 }
 
-/// \brief Calculates the reciprocals of the values in a 256-bit vector of
+/// Calculates the reciprocals of the values in a 256-bit vector of
 ///    [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
+/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float].
@@ -379,7 +380,7 @@
   return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
 }
 
-/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
+/// Rounds the values in a 256-bit vector of [4 x double] as specified
 ///    by the byte operand. The source values are rounded to integer values and
 ///    returned as 64-bit double-precision floating-point values.
 ///
@@ -389,29 +390,29 @@
 /// __m256d _mm256_round_pd(__m256d V, const int M);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
 ///
 /// \param V
 ///    A 256-bit vector of [4 x double].
 /// \param M
-///    An integer value that specifies the rounding operation.
-///    Bits [7:4] are reserved.
-///    Bit [3] is a precision exception value:
-///    0: A normal PE exception is used.
-///    1: The PE field is not updated.
-///    Bit [2] is the rounding control source:
-///    0: Use bits [1:0] of M.
-///    1: Use the current MXCSR setting.
-///    Bits [1:0] contain the rounding control definition:
-///    00: Nearest.
-///    01: Downward (toward negative infinity).
-///    10: Upward (toward positive infinity).
-///    11: Truncated.
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used. \n
+///      1: The PE field is not updated. \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M. \n
+///      1: Use the current MXCSR setting. \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest. \n
+///      01: Downward (toward negative infinity). \n
+///      10: Upward (toward positive infinity). \n
+///      11: Truncated.
 /// \returns A 256-bit vector of [4 x double] containing the rounded values.
-#define _mm256_round_pd(V, M) __extension__ ({ \
-    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
+#define _mm256_round_pd(V, M) \
+    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))
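// Sketch (illustrative, assumes #include <immintrin.h>): with bit 2 of M
// clear, bits [1:0] select the rounding mode directly; _MM_FROUND_NO_EXC
// (0x08) sets bit 3 to suppress precision exceptions.
static __m256d round_nearest4(__m256d v) {
  return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}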
 
-/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
+/// Rounds the values stored in a 256-bit vector of [8 x float] as
 ///    specified by the byte operand. The source values are rounded to integer
 ///    values and returned as floating-point values.
 ///
@@ -421,29 +422,29 @@
 /// __m256 _mm256_round_ps(__m256 V, const int M);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
 ///
 /// \param V
 ///    A 256-bit vector of [8 x float].
 /// \param M
-///    An integer value that specifies the rounding operation.
-///    Bits [7:4] are reserved.
-///    Bit [3] is a precision exception value:
-///    0: A normal PE exception is used.
-///    1: The PE field is not updated.
-///    Bit [2] is the rounding control source:
-///    0: Use bits [1:0] of M.
-///    1: Use the current MXCSR setting.
-///    Bits [1:0] contain the rounding control definition:
-///    00: Nearest.
-///    01: Downward (toward negative infinity).
-///    10: Upward (toward positive infinity).
-///    11: Truncated.
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used. \n
+///      1: The PE field is not updated. \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M. \n
+///      1: Use the current MXCSR setting. \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest. \n
+///      01: Downward (toward negative infinity). \n
+///      10: Upward (toward positive infinity). \n
+///      11: Truncated.
 /// \returns A 256-bit vector of [8 x float] containing the rounded values.
-#define _mm256_round_ps(V, M) __extension__ ({ \
-  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
+#define _mm256_round_ps(V, M) \
+  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))
 
-/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
+/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
 ///    source values are rounded up to integer values and returned as 64-bit
 ///    double-precision floating-point values.
 ///
@@ -453,14 +454,14 @@
 /// __m256d _mm256_ceil_pd(__m256d V);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
 ///
 /// \param V
 ///    A 256-bit vector of [4 x double].
 /// \returns A 256-bit vector of [4 x double] containing the rounded up values.
 #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
 
-/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
+/// Rounds down the values stored in a 256-bit vector of [4 x double].
 ///    The source values are rounded down to integer values and returned as
 ///    64-bit double-precision floating-point values.
 ///
@@ -470,7 +471,7 @@
 /// __m256d _mm256_floor_pd(__m256d V);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
 ///
 /// \param V
 ///    A 256-bit vector of [4 x double].
@@ -478,7 +479,7 @@
 ///    values.
 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
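// Worked example (illustrative) of the fixed-mode wrappers above, where
// _MM_FROUND_FLOOR == 0x01 and _MM_FROUND_CEIL == 0x02, i.e. fixed values of
// the rounding-control bits [1:0] described earlier:
//   _mm256_ceil_pd ({1.2, -1.2, 2.5, -2.5}) -> { 2.0, -1.0,  3.0, -2.0}
//   _mm256_floor_pd({1.2, -1.2, 2.5, -2.5}) -> { 1.0, -2.0,  2.0, -3.0}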
 
-/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
+/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
 ///    source values are rounded up to integer values and returned as
 ///    floating-point values.
 ///
@@ -488,14 +489,14 @@
 /// __m256 _mm256_ceil_ps(__m256 V);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
 ///
 /// \param V
 ///    A 256-bit vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the rounded up values.
 #define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
 
-/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
+/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
 ///    source values are rounded down to integer values and returned as
 ///    floating-point values.
 ///
@@ -505,7 +506,7 @@
 /// __m256 _mm256_floor_ps(__m256 V);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
 ///
 /// \param V
 ///    A 256-bit vector of [8 x float].
@@ -513,11 +514,11 @@
 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
 
 /* Logical */
-/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
+/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
+/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
@@ -531,11 +532,11 @@
   return (__m256d)((__v4du)__a & (__v4du)__b);
 }
 
-/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
+/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
+/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
@@ -549,12 +550,12 @@
   return (__m256)((__v8su)__a & (__v8su)__b);
 }
 
-/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
+/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
 ///    the one's complement of the values contained in the first source operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
+/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing the left source operand. The
@@ -570,12 +571,12 @@
   return (__m256d)(~(__v4du)__a & (__v4du)__b);
 }
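// Sketch (illustrative): a classic use of ANDNOT on doubles is clearing only
// the sign bit of each lane, i.e. a vectorized fabs().
static __m256d abs4(__m256d v) {
  // ~(-0.0) has every bit set except the sign bit of each 64-bit lane.
  return _mm256_andnot_pd(_mm256_set1_pd(-0.0), v);
}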
 
-/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
+/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
 ///    the one's complement of the values contained in the first source operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
+/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing the left source operand. The
@@ -591,11 +592,11 @@
   return (__m256)(~(__v8su)__a & (__v8su)__b);
 }
 
-/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
+/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
+/// This intrinsic corresponds to the <c> VORPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
@@ -609,11 +610,11 @@
   return (__m256d)((__v4du)__a | (__v4du)__b);
 }
 
-/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
+/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
+/// This intrinsic corresponds to the <c> VORPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
@@ -627,11 +628,11 @@
   return (__m256)((__v8su)__a | (__v8su)__b);
 }
 
-/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
+/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
+/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
@@ -645,11 +646,11 @@
   return (__m256d)((__v4du)__a ^ (__v4du)__b);
 }
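// Sketch (illustrative): XOR with -0.0 flips only the sign bit of each lane,
// giving a branch-free per-element negation.
static __m256d neg4(__m256d v) {
  return _mm256_xor_pd(v, _mm256_set1_pd(-0.0));
}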
 
-/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
+/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
@@ -664,12 +665,12 @@
 }
 
 /* Horizontal arithmetic */
-/// \brief Horizontally adds the adjacent pairs of values contained in two
+/// Horizontally adds the adjacent pairs of values contained in two
 ///    256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
@@ -687,12 +688,12 @@
   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
 }
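// Sketch (illustrative) of the interleaved result layout: each 128-bit lane
// is handled independently, so
//   _mm256_hadd_pd({a0,a1,a2,a3}, {b0,b1,b2,b3}) == {a0+a1, b0+b1, a2+a3, b2+b3}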
 
-/// \brief Horizontally adds the adjacent pairs of values contained in two
+/// Horizontally adds the adjacent pairs of values contained in two
 ///    256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
@@ -710,12 +711,12 @@
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
-/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+/// Horizontally subtracts the adjacent pairs of values contained in two
 ///    256-bit vectors of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double] containing one of the source operands.
@@ -733,12 +734,12 @@
   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
 }
 
-/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+/// Horizontally subtracts the adjacent pairs of values contained in two
 ///    256-bit vectors of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing one of the source operands.
@@ -757,66 +758,66 @@
 }
 
 /* Vector permutations */
-/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
+/// Copies the values in a 128-bit vector of [2 x double] as specified
 ///    by the 128-bit integer vector operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [2 x double].
 /// \param __c
 ///    A 128-bit integer vector operand specifying how the values are to be
-///    copied.
-///    Bit [1]:
-///    0: Bits [63:0] of the source are copied to bits [63:0] of the
-///    returned vector.
-///    1: Bits [127:64] of the source are copied to bits [63:0] of the
-///    returned vector.
-///    Bit [65]:
-///    0: Bits [63:0] of the source are copied to bits [127:64] of the
-///    returned vector.
-///    1: Bits [127:64] of the source are copied to bits [127:64] of the
-///    returned vector.
+///    copied. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [65]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector.
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
-static __inline __m128d __DEFAULT_FN_ATTRS
+static __inline __m128d __DEFAULT_FN_ATTRS128
 _mm_permutevar_pd(__m128d __a, __m128i __c)
 {
   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
 }
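// Sketch (illustrative): only bit 1 of each 64-bit control element (bits 1
// and 65 overall, per the table above) is consulted, so selector values
// {2, 0} swap the two doubles. Assumes #include <immintrin.h>.
static __m128d swap2(__m128d v) {
  // element 0 selector = 2 (bit 1 set -> take v[1]); element 1 selector = 0.
  return _mm_permutevar_pd(v, _mm_set_epi64x(0, 2));
}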
 
-/// \brief Copies the values in a 256-bit vector of [4 x double] as
-///    specified by the 256-bit integer vector operand.
+/// Copies the values in a 256-bit vector of [4 x double] as specified
+///    by the 256-bit integer vector operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double].
 /// \param __c
 ///    A 256-bit integer vector operand specifying how the values are to be
-///    copied.
-///    Bit [1]:
-///    0: Bits [63:0] of the source are copied to bits [63:0] of the
-///    returned vector.
-///    1: Bits [127:64] of the source are copied to bits [63:0] of the
-///    returned vector.
-///    Bit [65]:
-///    0: Bits [63:0] of the source are copied to bits [127:64] of the
-///    returned vector.
-///    1: Bits [127:64] of the source are copied to bits [127:64] of the
-///    returned vector.
-///    Bit [129]:
-///    0: Bits [191:128] of the source are copied to bits [191:128] of the
-///    returned vector.
-///    1: Bits [255:192] of the source are copied to bits [191:128] of the
-///    returned vector.
-///    Bit [193]:
-///    0: Bits [191:128] of the source are copied to bits [255:192] of the
-///    returned vector.
-///    1: Bits [255:192] of the source are copied to bits [255:192] of the
+///    copied. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [65]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///    Bit [129]: \n
+///      0: Bits [191:128] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///    Bit [193]: \n
+///      0: Bits [191:128] of the source are copied to bits [255:192] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [255:192] of the
 ///         returned vector.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 static __inline __m256d __DEFAULT_FN_ATTRS
@@ -825,145 +826,144 @@
   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
 }
 
-/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
+/// Copies the values stored in a 128-bit vector of [4 x float] as
 ///    specified by the 128-bit integer vector operand.
-///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \param __c
 ///    A 128-bit integer vector operand specifying how the values are to be
-///    copied.
-///    Bits [1:0]:
-///    00: Bits [31:0] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    Bits [33:32]:
-///    00: Bits [31:0] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    Bits [65:64]:
-///    00: Bits [31:0] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    Bits [97:96]:
-///    00: Bits [31:0] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [127:96] of the
-///    returned vector.
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [33:32]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [65:64]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [97:96]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector.
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
-static __inline __m128 __DEFAULT_FN_ATTRS
+static __inline __m128 __DEFAULT_FN_ATTRS128
 _mm_permutevar_ps(__m128 __a, __m128i __c)
 {
   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
 }
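// Sketch (illustrative): each 32-bit control element uses only its low two
// bits, as the table above spells out; selectors {3, 2, 1, 0} reverse the
// four floats.
static __m128 reverse4(__m128 v) {
  return _mm_permutevar_ps(v, _mm_set_epi32(0, 1, 2, 3));
}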
 
-/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
+/// Copies the values stored in a 256-bit vector of [8 x float] as
 ///    specified by the 256-bit integer vector operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float].
 /// \param __c
 ///    A 256-bit integer vector operand specifying how the values are to be
-///    copied.
-///    Bits [1:0]:
-///    00: Bits [31:0] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    Bits [33:32]:
-///    00: Bits [31:0] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    Bits [65:64]:
-///    00: Bits [31:0] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    Bits [97:96]:
-///    00: Bits [31:0] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    Bits [129:128]:
-///    00: Bits [159:128] of the source are copied to bits [159:128] of the
-///    returned vector.
-///    01: Bits [191:160] of the source are copied to bits [159:128] of the
-///    returned vector.
-///    10: Bits [223:192] of the source are copied to bits [159:128] of the
-///    returned vector.
-///    11: Bits [255:224] of the source are copied to bits [159:128] of the
-///    returned vector.
-///    Bits [161:160]:
-///    00: Bits [159:128] of the source are copied to bits [191:160] of the
-///    returned vector.
-///    01: Bits [191:160] of the source are copied to bits [191:160] of the
-///    returned vector.
-///    10: Bits [223:192] of the source are copied to bits [191:160] of the
-///    returned vector.
-///    11: Bits [255:224] of the source are copied to bits [191:160] of the
-///    returned vector.
-///    Bits [193:192]:
-///    00: Bits [159:128] of the source are copied to bits [223:192] of the
-///    returned vector.
-///    01: Bits [191:160] of the source are copied to bits [223:192] of the
-///    returned vector.
-///    10: Bits [223:192] of the source are copied to bits [223:192] of the
-///    returned vector.
-///    11: Bits [255:224] of the source are copied to bits [223:192] of the
-///    returned vector.
-///    Bits [225:224]:
-///    00: Bits [159:128] of the source are copied to bits [255:224] of the
-///    returned vector.
-///    01: Bits [191:160] of the source are copied to bits [255:224] of the
-///    returned vector.
-///    10: Bits [223:192] of the source are copied to bits [255:224] of the
-///    returned vector.
-///    11: Bits [255:224] of the source are copied to bits [255:224] of the
-///    returned vector.
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [33:32]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [65:64]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [97:96]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///    Bits [129:128]: \n
+///      00: Bits [159:128] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///    Bits [161:160]: \n
+///      00: Bits [159:128] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///    Bits [193:192]: \n
+///      00: Bits [159:128] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///    Bits [225:224]: \n
+///      00: Bits [159:128] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [255:224] of the
+///          returned vector.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_permutevar_ps(__m256 __a, __m256i __c)
@@ -971,8 +971,8 @@
   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
 }
 
-/// \brief Copies the values in a 128-bit vector of [2 x double] as
-///    specified by the immediate integer operand.
+/// Copies the values in a 128-bit vector of [2 x double] as specified
+///    by the immediate integer operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -980,30 +980,29 @@
 /// __m128d _mm_permute_pd(__m128d A, const int C);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
 ///
 /// \param A
 ///    A 128-bit vector of [2 x double].
 /// \param C
-///    An immediate integer operand specifying how the values are to be copied.
-///    Bit [0]:
-///    0: Bits [63:0] of the source are copied to bits [63:0] of the
-///    returned vector.
-///    1: Bits [127:64] of the source are copied to bits [63:0] of the
-///    returned vector.
-///    Bit [1]:
-///    0: Bits [63:0] of the source are copied to bits [127:64] of the
-///    returned vector.
-///    1: Bits [127:64] of the source are copied to bits [127:64] of the
-///    returned vector.
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bit [0]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector.
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
-#define _mm_permute_pd(A, C) __extension__ ({ \
-  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
-                                   (__v2df)_mm_undefined_pd(), \
-                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
+#define _mm_permute_pd(A, C) \
+  (__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))
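// Sketch (illustrative): in the immediate form, bit 0 of C selects the low
// result element and bit 1 the high one, so C == 0x1 swaps the pair:
//   _mm_permute_pd({x, y}, 0x1) == {y, x}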
 
-/// \brief Copies the values in a 256-bit vector of [4 x double] as
-///    specified by the immediate integer operand.
+/// Copies the values in a 256-bit vector of [4 x double] as specified by
+///    the immediate integer operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1011,43 +1010,39 @@
 /// __m256d _mm256_permute_pd(__m256d A, const int C);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
 ///
 /// \param A
 ///    A 256-bit vector of [4 x double].
 /// \param C
-///    An immediate integer operand specifying how the values are to be copied.
-///    Bit [0]:
-///    0: Bits [63:0] of the source are copied to bits [63:0] of the
-///    returned vector.
-///    1: Bits [127:64] of the source are copied to bits [63:0] of the
-///    returned vector.
-///    Bit [1]:
-///    0: Bits [63:0] of the source are copied to bits [127:64] of the
-///    returned vector.
-///    1: Bits [127:64] of the source are copied to bits [127:64] of the
-///    returned vector.
-///    Bit [2]:
-///    0: Bits [191:128] of the source are copied to bits [191:128] of the
-///    returned vector.
-///    1: Bits [255:192] of the source are copied to bits [191:128] of the
-///    returned vector.
-///    Bit [3]:
-///    0: Bits [191:128] of the source are copied to bits [255:192] of the
-///    returned vector.
-///    1: Bits [255:192] of the source are copied to bits [255:192] of the
-///    returned vector.
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bit [0]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///    Bit [2]: \n
+///      0: Bits [191:128] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///    Bit [3]: \n
+///      0: Bits [191:128] of the source are copied to bits [255:192] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [255:192] of the
+///         returned vector.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
-#define _mm256_permute_pd(A, C) __extension__ ({ \
-  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
-                                   (__v4df)_mm256_undefined_pd(), \
-                                   0 + (((C) >> 0) & 0x1), \
-                                   0 + (((C) >> 1) & 0x1), \
-                                   2 + (((C) >> 2) & 0x1), \
-                                   2 + (((C) >> 3) & 0x1)); })
+#define _mm256_permute_pd(A, C) \
+  (__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))
 
-/// \brief Copies the values in a 128-bit vector of [4 x float] as
-///    specified by the immediate integer operand.
+/// Copies the values in a 128-bit vector of [4 x float] as specified by
+///    the immediate integer operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1055,57 +1050,55 @@
 /// __m128 _mm_permute_ps(__m128 A, const int C);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
 ///
 /// \param A
 ///    A 128-bit vector of [4 x float].
 /// \param C
-///    An immediate integer operand specifying how the values are to be copied.
-///    Bits [1:0]:
-///    00: Bits [31:0] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    Bits [3:2]:
-///    00: Bits [31:0] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    Bits [5:4]:
-///    00: Bits [31:0] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    Bits [7:6]:
-///    00: Bits [31:0] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [127:96] of the
-///    returned vector.
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [3:2]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [5:4]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [7:6]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector.
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
-#define _mm_permute_ps(A, C) __extension__ ({ \
-  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
-                                  (__v4sf)_mm_undefined_ps(), \
-                                  ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
-                                  ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+#define _mm_permute_ps(A, C) \
+  (__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))
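// Sketch (illustrative): C packs four 2-bit source indices, with the lowest
// bits controlling the lowest result element, so 0x1B (0b00011011) reverses
// the vector:
//   _mm_permute_ps({x0, x1, x2, x3}, 0x1B) == {x3, x2, x1, x0}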
 
-/// \brief Copies the values in a 256-bit vector of [8 x float] as
-///    specified by the immediate integer operand.
+/// Copies the values in a 256-bit vector of [8 x float] as specified by
+///    the immediate integer operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1113,98 +1106,90 @@
 /// __m256 _mm256_permute_ps(__m256 A, const int C);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
 ///
 /// \param A
 ///    A 256-bit vector of [8 x float].
 /// \param C
-///    An immediate integer operand specifying how the values are to be copied.
-///    Bits [1:0]:
-///    00: Bits [31:0] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [31:0] of the
-///    returned vector.
-///    Bits [3:2]:
-///    00: Bits [31:0] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [63:32] of the
-///    returned vector.
-///    Bits [5:4]:
-///    00: Bits [31:0] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [95:64] of the
-///    returned vector.
-///    Bits [7:6]:
-///    00: Bits [31:0] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    01: Bits [63:32] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    10: Bits [95:64] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    11: Bits [127:96] of the source are copied to bits [127:96] of the
-///    returned vector.
-///    Bits [1:0]:
-///    00: Bits [159:128] of the source are copied to bits [159:128] of the
-///    returned vector.
-///    01: Bits [191:160] of the source are copied to bits [159:128] of the
-///    returned vector.
-///    10: Bits [223:192] of the source are copied to bits [159:128] of the
-///    returned vector.
-///    11: Bits [255:224] of the source are copied to bits [159:128] of the
-///    returned vector.
-///    Bits [3:2]:
-///    00: Bits [159:128] of the source are copied to bits [191:160] of the
-///    returned vector.
-///    01: Bits [191:160] of the source are copied to bits [191:160] of the
-///    returned vector.
-///    10: Bits [223:192] of the source are copied to bits [191:160] of the
-///    returned vector.
-///    11: Bits [255:224] of the source are copied to bits [191:160] of the
-///    returned vector.
-///    Bits [5:4]:
-///    00: Bits [159:128] of the source are copied to bits [223:192] of the
-///    returned vector.
-///    01: Bits [191:160] of the source are copied to bits [223:192] of the
-///    returned vector.
-///    10: Bits [223:192] of the source are copied to bits [223:192] of the
-///    returned vector.
-///    11: Bits [255:224] of the source are copied to bits [223:192] of the
-///    returned vector.
-///    Bits [7:6]:
-///    00: Bits [159:128] of the source are copied to bits [255:224] of the
-///    returned vector.
-///    01: Bits [191:160] of the source are copied to bits [255:224] of the
-///    returned vector.
-///    10: Bits [223:192] of the source are copied to bits [255:224] of the
-///    returned vector.
-///    11: Bits [255:224] of the source are copied to bits [255:224] of the
-///    returned vector.
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [3:2]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [5:4]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [7:6]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///    Bits [1:0]: \n
+///      00: Bits [159:128] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///    Bits [3:2]: \n
+///      00: Bits [159:128] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///    Bits [5:4]: \n
+///      00: Bits [159:128] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///    Bits [7:6]: \n
+///      00: Bits [159:128] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [255:224] of the
+///          returned vector.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
-#define _mm256_permute_ps(A, C) __extension__ ({ \
-  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
-                                  (__v8sf)_mm256_undefined_ps(), \
-                                  0 + (((C) >> 0) & 0x3), \
-                                  0 + (((C) >> 2) & 0x3), \
-                                  0 + (((C) >> 4) & 0x3), \
-                                  0 + (((C) >> 6) & 0x3), \
-                                  4 + (((C) >> 0) & 0x3), \
-                                  4 + (((C) >> 2) & 0x3), \
-                                  4 + (((C) >> 4) & 0x3), \
-                                  4 + (((C) >> 6) & 0x3)); })
+#define _mm256_permute_ps(A, C) \
+  (__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
 
-/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
+/// Permutes 128-bit data values stored in two 256-bit vectors of
 ///    [4 x double], as specified by the immediate integer operand.
 ///
 /// \headerfile <x86intrin.h>
@@ -1213,7 +1198,7 @@
 /// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
 ///
 /// \param V1
 ///    A 256-bit vector of [4 x double].
@@ -1221,31 +1206,31 @@
 ///    A 256-bit vector of [4 x double].
 /// \param M
 ///    An immediate integer operand specifying how the values are to be
-///    permuted.
-///    Bits [1:0]:
-///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
-///    destination.
-///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
-///    destination.
-///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
-///    destination.
-///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
-///    destination.
-///    Bits [5:4]:
-///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
-///    destination.
-///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
-///    destination.
-///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
-///    destination.
-///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
-///    destination.
+///    permuted. \n
+///    Bits [1:0]: \n
+///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+///          destination. \n
+///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+///          destination. \n
+///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+///          destination. \n
+///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+///          destination. \n
+///    Bits [5:4]: \n
+///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+///          destination. \n
+///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+///          destination. \n
+///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+///          destination. \n
+///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+///          destination.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
-#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
+#define _mm256_permute2f128_pd(V1, V2, M) \
   (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
-                                           (__v4df)(__m256d)(V2), (M)); })
+                                           (__v4df)(__m256d)(V2), (int)(M))
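// Sketch (illustrative): each 2-bit field of M selects one 128-bit half
// (0 = V1 low, 1 = V1 high, 2 = V2 low, 3 = V2 high), so M == 0x21 yields
// {V1[255:128], V2[127:0]}, a common lane-crossing building block:
//   __m256d t = _mm256_permute2f128_pd(v1, v2, 0x21);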
 
-/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
+/// Permutes 128-bit data values stored in two 256-bit vectors of
 ///    [8 x float], as specified by the immediate integer operand.
 ///
 /// \headerfile <x86intrin.h>
@@ -1254,7 +1239,7 @@
 /// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
 ///
 /// \param V1
 ///    A 256-bit vector of [8 x float].
@@ -1262,31 +1247,31 @@
 ///    A 256-bit vector of [8 x float].
 /// \param M
 ///    An immediate integer operand specifying how the values are to be
-///    permuted.
-///    Bits [1:0]:
-///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
-///    destination.
-///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
-///    destination.
-///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
-///    destination.
-///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
-///    destination.
-///    Bits [5:4]:
-///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
-///    destination.
-///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
-///    destination.
-///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
-///    destination.
-///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
+///    permuted. \n
+///    Bits [1:0]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    Bits [5:4]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
 ///    destination.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
-#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
+#define _mm256_permute2f128_ps(V1, V2, M) \
   (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
-                                          (__v8sf)(__m256)(V2), (M)); })
+                                          (__v8sf)(__m256)(V2), (int)(M))
 
-/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
+/// Permutes 128-bit data values stored in two 256-bit integer vectors,
 ///    as specified by the immediate integer operand.
 ///
 /// \headerfile <x86intrin.h>
@@ -1295,7 +1280,7 @@
 /// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
 ///
 /// \param V1
 ///    A 256-bit integer vector.
@@ -1303,31 +1288,31 @@
 ///    A 256-bit integer vector.
 /// \param M
 ///    An immediate integer operand specifying how the values are to be copied.
-///    Bits [1:0]:
-///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
-///    destination.
-///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
-///    destination.
-///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
-///    destination.
-///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
-///    destination.
-///    Bits [5:4]:
-///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
-///    destination.
-///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
-///    destination.
-///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
-///    destination.
-///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
+///    Bits [1:0]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    Bits [5:4]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
 ///    destination.
 /// \returns A 256-bit integer vector containing the copied values.
-#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
+#define _mm256_permute2f128_si256(V1, V2, M) \
   (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
-                                           (__v8si)(__m256i)(V2), (M)); })
+                                           (__v8si)(__m256i)(V2), (int)(M))
 
 /* Vector Blend */
-/// \brief Merges 64-bit double-precision data values stored in either of the
+/// Merges 64-bit double-precision data values stored in either of the
 ///    two 256-bit vectors of [4 x double], as specified by the immediate
 ///    integer operand.
 ///
@@ -1337,7 +1322,7 @@
 /// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VBLENDPD / BLENDPD instruction.
+/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
 ///
 /// \param V1
 ///    A 256-bit vector of [4 x double].
@@ -1347,19 +1332,15 @@
 ///    An immediate integer operand, with mask bits [3:0] specifying how the
 ///    values are to be copied. The position of the mask bit corresponds to the
 ///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
-///    element in operand V1 is copied to the same position in the destination.
-///    When a mask bit is 1, the corresponding 64-bit element in operand V2 is
-///    copied to the same position in the destination.
+///    element in operand \a V1 is copied to the same position in the
+///    destination. When a mask bit is 1, the corresponding 64-bit element in
+///    operand \a V2 is copied to the same position in the destination.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
-#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
-  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
-                                   (__v4df)(__m256d)(V2), \
-                                   (((M) & 0x01) ? 4 : 0), \
-                                   (((M) & 0x02) ? 5 : 1), \
-                                   (((M) & 0x04) ? 6 : 2), \
-                                   (((M) & 0x08) ? 7 : 3)); })
+#define _mm256_blend_pd(V1, V2, M) \
+  (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
+                                     (__v4df)(__m256d)(V2), (int)(M))
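A short sketch of the mask encoding (illustrative; helper name invented): each of the four mask bits picks the source for one element position.

    #include <immintrin.h>

    /* Mask 0x0A = 0b1010: elements 1 and 3 come from b, elements 0 and 2
       from a. The mask must be an integer constant expression. */
    static __m256d mix_pairs(__m256d a, __m256d b) {
      return _mm256_blend_pd(a, b, 0x0A);
    }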
 
-/// \brief Merges 32-bit single-precision data values stored in either of the
+/// Merges 32-bit single-precision data values stored in either of the
 ///    two 256-bit vectors of [8 x float], as specified by the immediate
 ///    integer operand.
 ///
@@ -1369,7 +1350,7 @@
 /// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VBLENDPS / BLENDPS instruction.
+/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
 ///
 /// \param V1
 ///    A 256-bit vector of [8 x float].
@@ -1379,29 +1360,21 @@
 ///    An immediate integer operand, with mask bits [7:0] specifying how the
 ///    values are to be copied. The position of the mask bit corresponds to the
 ///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
-///    element in operand V1 is copied to the same position in the destination.
-///    When a mask bit is 1, the corresponding 32-bit element in operand V2 is
-///    copied to the same position in the destination.
+///    element in operand \a V1 is copied to the same position in the
+///    destination. When a mask bit is 1, the corresponding 32-bit element in
+///    operand \a V2 is copied to the same position in the destination.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
-#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
-  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
-                                  (__v8sf)(__m256)(V2), \
-                                  (((M) & 0x01) ?  8 : 0), \
-                                  (((M) & 0x02) ?  9 : 1), \
-                                  (((M) & 0x04) ? 10 : 2), \
-                                  (((M) & 0x08) ? 11 : 3), \
-                                  (((M) & 0x10) ? 12 : 4), \
-                                  (((M) & 0x20) ? 13 : 5), \
-                                  (((M) & 0x40) ? 14 : 6), \
-                                  (((M) & 0x80) ? 15 : 7)); })
+#define _mm256_blend_ps(V1, V2, M) \
+  (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
+                                    (__v8sf)(__m256)(V2), (int)(M))
 
-/// \brief Merges 64-bit double-precision data values stored in either of the
+/// Merges 64-bit double-precision data values stored in either of the
 ///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
 ///    operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VBLENDVPD / BLENDVPD instruction.
+/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double].
@@ -1411,9 +1384,9 @@
 ///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
 ///    how the values are to be copied. The position of the mask bit corresponds
 ///    to the most significant bit of a copied value. When a mask bit is 0, the
-///    corresponding 64-bit element in operand __a is copied to the same
+///    corresponding 64-bit element in operand \a __a is copied to the same
 ///    position in the destination. When a mask bit is 1, the corresponding
-///    64-bit element in operand __b is copied to the same position in the
+///    64-bit element in operand \a __b is copied to the same position in the
 ///    destination.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 static __inline __m256d __DEFAULT_FN_ATTRS
@@ -1423,13 +1396,13 @@
     (__v4df)__a, (__v4df)__b, (__v4df)__c);
 }
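Unlike the immediate blends above, the selector here is a runtime vector, so the mask can be computed on the fly. One illustrative sketch (helper name invented): since only the sign bit of each mask element is consulted, a value can serve as its own mask to zero out its negative elements.

    #include <immintrin.h>

    /* Where an element of x has its sign bit set, the zero operand is
       selected; elsewhere x passes through unchanged. */
    static __m256d zero_negatives(__m256d x) {
      return _mm256_blendv_pd(x, _mm256_setzero_pd(), x);
    }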
 
-/// \brief Merges 32-bit single-precision data values stored in either of the
+/// Merges 32-bit single-precision data values stored in either of the
 ///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
 ///    operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VBLENDVPS / BLENDVPS instruction.
+/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float].
@@ -1439,9 +1412,9 @@
 ///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
 ///    and 31 specifying how the values are to be copied. The position of the
 ///    mask bit corresponds to the most significant bit of a copied value. When
-///    a mask bit is 0, the corresponding 32-bit element in operand __a is
+///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
 ///    copied to the same position in the destination. When a mask bit is 1, the
-///    corresponding 32-bit element in operand __b is copied to the same
+///    corresponding 32-bit element in operand \a __b is copied to the same
 ///    position in the destination.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 static __inline __m256 __DEFAULT_FN_ATTRS
@@ -1452,15 +1425,16 @@
 }
 
 /* Vector Dot Product */
-/// \brief Computes two dot products in parallel, using the lower and upper
+/// Computes two dot products in parallel, using the lower and upper
 ///    halves of two [8 x float] vectors as input to the two computations, and
 ///    returning the two dot products in the lower and upper halves of the
-///    [8 x float] result. The immediate integer operand controls which
-///    input elements will contribute to the dot product, and where the final
-///    results are returned. In general, for each dot product, the four
-///    corresponding elements of the input vectors are multiplied; the first
-///    two and second two products are summed, then the two sums are added to
-///    form the final result.
+///    [8 x float] result.
+///
+///    The immediate integer operand controls which input elements will
+///    contribute to the dot product, and where the final results are returned.
+///    In general, for each dot product, the four corresponding elements of the
+///    input vectors are multiplied; the first two and second two products are
+///    summed, then the two sums are added to form the final result.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1468,7 +1442,7 @@
 /// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VDPPS / DPPS instruction.
+/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
 ///
 /// \param V1
 ///    A vector of [8 x float] values, treated as two [4 x float] vectors.
@@ -1488,21 +1462,22 @@
 ///    is set to zero. The bitmask is applied in the same way to each of the
 ///    two parallel dot product computations.
 /// \returns A 256-bit vector of [8 x float] containing the two dot products.
-#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
+#define _mm256_dp_ps(V1, V2, M) \
   (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
-                                 (__v8sf)(__m256)(V2), (M)); })
+                                 (__v8sf)(__m256)(V2), (M))
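A hedged sketch of the mask described above (illustrative; helper name invented): the high nibble chooses which input elements participate and the low nibble chooses where each half's sum is written.

    #include <immintrin.h>

    /* 0xF1: bits [7:4] = 0xF include all four elements of each 128-bit
       half; bits [3:0] = 0x1 write each half's dot product to its element 0
       and zero the remaining result elements. */
    static __m256 two_dot_products(__m256 a, __m256 b) {
      return _mm256_dp_ps(a, b, 0xF1);
    }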
 
 /* Vector shuffle */
-/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
-///    specified by the immediate value operand. The four selected elements in
-///    each operand are copied to the destination according to the bits
-///    specified in the immediate operand. The selected elements from the first
-///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
-///    destination, and the selected elements from the second 256-bit operand
-///    are copied to bits [127:64] and bits [255:192] of the destination. For
-///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
-///    the 256-bit destination vector would contain the following values: b[7],
-///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
+/// Selects 8 float values from the 256-bit operands of [8 x float], as
+///    specified by the immediate value operand.
+///
+///    The four selected elements in each operand are copied to the destination
+///    according to the bits specified in the immediate operand. The selected
+///    elements from the first 256-bit operand are copied to bits [63:0] and
+///    bits [191:128] of the destination, and the selected elements from the
+///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
+///    the destination. For example, if bits [7:0] of the immediate operand
+///    contain a value of 0xFF, the 256-bit destination vector would contain the
+///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1510,7 +1485,7 @@
 /// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
+/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
 ///
 /// \param a
 ///    A 256-bit vector of [8 x float]. The four selected elements in this
@@ -1522,44 +1497,38 @@
 ///    destination, according to the bits specified in the immediate operand.
 /// \param mask
 ///    An immediate value containing an 8-bit value specifying which elements to
-///    copy from a and b. Bits [3:0] specify the values copied from operand a.
-///    Bits [7:4] specify the values copied from operand b.
+///    copy from \a a and \a b. \n
+///    Bits [3:0] specify the values copied from operand \a a. \n
+///    Bits [7:4] specify the values copied from operand \a b. \n
 ///    The destinations within the 256-bit destination are assigned values as
-///    follows, according to the bit value assignments described below:
+///    follows, according to the bit value assignments described below: \n
 ///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
-///    destination.
+///    destination. \n
 ///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
-///    destination.
+///    destination. \n
 ///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
-///    destination.
+///    destination. \n
 ///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
-///    the destination.
-///    Bit value assignments:
-///    00: Bits [31:0] and [159:128] are copied from the selected operand.
-///    01: Bits [63:32] and [191:160] are copied from the selected operand.
-///    10: Bits [95:64] and [223:192] are copied from the selected operand.
+///    the destination. \n
+///    Bit value assignments: \n
+///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
+///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
+///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
 ///    11: Bits [127:96] and [255:224] are copied from the selected operand.
 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
-#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
-  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
-                                  (__v8sf)(__m256)(b), \
-                                  0  + (((mask) >> 0) & 0x3), \
-                                  0  + (((mask) >> 2) & 0x3), \
-                                  8  + (((mask) >> 4) & 0x3), \
-                                  8  + (((mask) >> 6) & 0x3), \
-                                  4  + (((mask) >> 0) & 0x3), \
-                                  4  + (((mask) >> 2) & 0x3), \
-                                  12 + (((mask) >> 4) & 0x3), \
-                                  12 + (((mask) >> 6) & 0x3)); })
+#define _mm256_shuffle_ps(a, b, mask) \
+  (__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
+                                   (__v8sf)(__m256)(b), (int)(mask))
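A brief sketch of the encoding (illustrative; helper name invented): the _MM_SHUFFLE helper from the SSE headers packs four 2-bit selectors into the immediate, which is applied identically to both 128-bit lanes.

    #include <immintrin.h>

    /* _MM_SHUFFLE(3, 2, 0, 1) == 0xE1. Per lane, the two low result
       elements are a[1] then a[0]; the two high result elements are b[2]
       then b[3] (lane-local indices). */
    static __m256 lanewise_mix(__m256 a, __m256 b) {
      return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 0, 1));
    }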
 
-/// \brief Selects four double-precision values from the 256-bit operands of
-///    [4 x double], as specified by the immediate value operand. The selected
-///    elements from the first 256-bit operand are copied to bits [63:0] and
-///    bits [191:128] in the destination, and the selected elements from the
-///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
-///    the destination. For example, if bits [3:0] of the immediate operand
-///    contain a value of 0xF, the 256-bit destination vector would contain the
-///    following values: b[3], a[3], b[1], a[1].
+/// Selects four double-precision values from the 256-bit operands of
+///    [4 x double], as specified by the immediate value operand.
+///
+///    The selected elements from the first 256-bit operand are copied to bits
+///    [63:0] and bits [191:128] in the destination, and the selected elements
+///    from the second 256-bit operand are copied to bits [127:64] and bits
+///    [255:192] in the destination. For example, if bits [3:0] of the immediate
+///    operand contain a value of 0xF, the 256-bit destination vector would
+///    contain the following values: b[3], a[3], b[1], a[1].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1567,7 +1536,7 @@
 /// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
+/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
 ///
 /// \param a
 ///    A 256-bit vector of [4 x double].
@@ -1575,31 +1544,27 @@
 ///    A 256-bit vector of [4 x double].
 /// \param mask
 ///    An immediate value containing 8-bit values specifying which elements to
-///    copy from a and b:
-///    Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the
-///    destination.
-///    Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the
-///    destination.
-///    Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the
-///    destination.
-///    Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the
-///    destination.
-///    Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the
-///    destination.
-///    Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the
-///    destination.
-///    Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the
-///    destination.
-///    Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the
+///    copy from \a a and \a b: \n
+///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
+///    destination. \n
+///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
+///    destination. \n
+///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
+///    destination. \n
+///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
+///    destination. \n
+///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
+///    destination. \n
+///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
+///    destination. \n
+///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
+///    destination. \n
+///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
 ///    destination.
 /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
-#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
-  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
-                                   (__v4df)(__m256d)(b), \
-                                   0 + (((mask) >> 0) & 0x1), \
-                                   4 + (((mask) >> 1) & 0x1), \
-                                   2 + (((mask) >> 2) & 0x1), \
-                                   6 + (((mask) >> 3) & 0x1)); })
+#define _mm256_shuffle_pd(a, b, mask) \
+  (__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
+                                    (__v4df)(__m256d)(b), (int)(mask))
 
 /* Compare */
 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
@@ -1609,9 +1574,9 @@
 #define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
 #define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
 #define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
-#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
+#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
 #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
-#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
 #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
 #define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
@@ -1624,10 +1589,10 @@
 #define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
 #define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
 #define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
-#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
 #define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
 #define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
-#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
 #define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
 #define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
@@ -1635,11 +1600,13 @@
 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
 
-/// \brief Compares each of the corresponding double-precision values of two
+/// Compares each of the corresponding double-precision values of two
 ///    128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand. Returns a [2 x double] vector consisting of
-///    two doubles corresponding to the two comparison results: zero if the
-///    comparison is false, and all 1's if the comparison is true.
+///    immediate integer operand.
+///
+///    Returns a [2 x double] vector consisting of two doubles corresponding to
+///    the two comparison results: zero if the comparison is false, and all 1's
+///    if the comparison is true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1647,7 +1614,7 @@
 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
 ///
 /// \param a
 ///    A 128-bit vector of [2 x double].
@@ -1655,27 +1622,51 @@
 ///    A 128-bit vector of [2 x double].
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use:
-///    00h, 08h, 10h, 18h: Equal
-///    01h, 09h, 11h, 19h: Less than
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-///                        operands)
-///    03h, 0Bh, 13h, 1Bh: Unordered
-///    04h, 0Ch, 14h, 1Ch: Not equal
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                        (swapped operands)
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_pd(a, b, c) __extension__ ({ \
+#define _mm_cmp_pd(a, b, c) \
   (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
-                                (__v2df)(__m128d)(b), (c)); })
+                                (__v2df)(__m128d)(b), (c))
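In practice the predicate is normally spelled with one of the _CMP_* macros defined above rather than a raw hex value. A minimal sketch (illustrative; the helper name is invented, and _CMP_LT_OQ is the 0x11 predicate from the table):

    #include <immintrin.h>

    /* Each result element is all-ones or all-zeros; movemask packs the two
       sign bits into an integer whose set bits can simply be counted. */
    static int count_less(__m128d a, __m128d b) {
      __m128d lt = _mm_cmp_pd(a, b, _CMP_LT_OQ);
      return __builtin_popcount(_mm_movemask_pd(lt));
    }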
 
-/// \brief Compares each of the corresponding values of two 128-bit vectors of
+/// Compares each of the corresponding values of two 128-bit vectors of
 ///    [4 x float], using the operation specified by the immediate integer
-///    operand. Returns a [4 x float] vector consisting of four floats
-///    corresponding to the four comparison results: zero if the comparison is
-///    false, and all 1's if the comparison is true.
+///    operand.
+///
+///    Returns a [4 x float] vector consisting of four floats corresponding to
+///    the four comparison results: zero if the comparison is false, and all 1's
+///    if the comparison is true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1683,7 +1674,7 @@
 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
 ///
 /// \param a
 ///    A 128-bit vector of [4 x float].
@@ -1691,27 +1682,51 @@
 ///    A 128-bit vector of [4 x float].
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use:
-///    00h, 08h, 10h, 18h: Equal
-///    01h, 09h, 11h, 19h: Less than
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-///                        operands)
-///    03h, 0Bh, 13h, 1Bh: Unordered
-///    04h, 0Ch, 14h, 1Ch: Not equal
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                       (swapped operands)
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ps(a, b, c) __extension__ ({ \
+#define _mm_cmp_ps(a, b, c) \
   (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
-                               (__v4sf)(__m128)(b), (c)); })
+                               (__v4sf)(__m128)(b), (c))
 
-/// \brief Compares each of the corresponding double-precision values of two
+/// Compares each of the corresponding double-precision values of two
 ///    256-bit vectors of [4 x double], using the operation specified by the
-///    immediate integer operand. Returns a [4 x double] vector consisting of
-///    four doubles corresponding to the four comparison results: zero if the
-///    comparison is false, and all 1's if the comparison is true.
+///    immediate integer operand.
+///
+///    Returns a [4 x double] vector consisting of four doubles corresponding to
+///    the four comparison results: zero if the comparison is false, and all 1's
+///    if the comparison is true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1719,7 +1734,7 @@
 /// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
 ///
 /// \param a
 ///    A 256-bit vector of [4 x double].
@@ -1727,27 +1742,51 @@
 ///    A 256-bit vector of [4 x double].
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use:
-///    00h, 08h, 10h, 18h: Equal
-///    01h, 09h, 11h, 19h: Less than
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-///                        operands)
-///    03h, 0Bh, 13h, 1Bh: Unordered
-///    04h, 0Ch, 14h, 1Ch: Not equal
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                        (swapped operands)
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 256-bit vector of [4 x double] containing the comparison results.
-#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
+#define _mm256_cmp_pd(a, b, c) \
   (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
-                                   (__v4df)(__m256d)(b), (c)); })
+                                   (__v4df)(__m256d)(b), (c))
 
-/// \brief Compares each of the corresponding values of two 256-bit vectors of
+/// Compares each of the corresponding values of two 256-bit vectors of
 ///    [8 x float], using the operation specified by the immediate integer
-///    operand. Returns a [8 x float] vector consisting of eight floats
-///    corresponding to the eight comparison results: zero if the comparison is
-///    false, and all 1's if the comparison is true.
+///    operand.
+///
+///    Returns an [8 x float] vector consisting of eight floats corresponding to
+///    the eight comparison results: zero if the comparison is false, and all
+///    1's if the comparison is true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1755,7 +1794,7 @@
 /// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
 ///
 /// \param a
 ///    A 256-bit vector of [8 x float].
@@ -1763,26 +1802,50 @@
 ///    A 256-bit vector of [8 x float].
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use:
-///    00h, 08h, 10h, 18h: Equal
-///    01h, 09h, 11h, 19h: Less than
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-///                        operands)
-///    03h, 0Bh, 13h, 1Bh: Unordered
-///    04h, 0Ch, 14h, 1Ch: Not equal
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                       (swapped operands)
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 256-bit vector of [8 x float] containing the comparison results.
-#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
+#define _mm256_cmp_ps(a, b, c) \
   (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
-                                  (__v8sf)(__m256)(b), (c)); })
+                                  (__v8sf)(__m256)(b), (c))
 
-/// \brief Compares each of the corresponding scalar double-precision values of
+/// Compares each of the corresponding scalar double-precision values of
 ///    two 128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand. If the result is true, all 64 bits of the
-///    destination vector are set; otherwise they are cleared.
+///    immediate integer operand.
+///
+///    If the result is true, all 64 bits of the destination vector are set;
+///    otherwise they are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1790,7 +1853,7 @@
 /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
+/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
 ///
 /// \param a
 ///    A 128-bit vector of [2 x double].
@@ -1798,26 +1861,50 @@
 ///    A 128-bit vector of [2 x double].
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use:
-///    00h, 08h, 10h, 18h: Equal
-///    01h, 09h, 11h, 19h: Less than
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-///                        operands)
-///    03h, 0Bh, 13h, 1Bh: Unordered
-///    04h, 0Ch, 14h, 1Ch: Not equal
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                       (swapped operands)
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_sd(a, b, c) __extension__ ({ \
+#define _mm_cmp_sd(a, b, c) \
   (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
-                                (__v2df)(__m128d)(b), (c)); })
+                                (__v2df)(__m128d)(b), (c))
 
-/// \brief Compares each of the corresponding scalar values of two 128-bit
+/// Compares each of the corresponding scalar values of two 128-bit
 ///    vectors of [4 x float], using the operation specified by the immediate
-///    integer operand. If the result is true, all 32 bits of the destination
-///    vector are set; otherwise they are cleared.
+///    integer operand.
+///
+///    If the result is true, all 32 bits of the destination vector are set;
+///    otherwise they are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1825,7 +1912,7 @@
 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
+/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
 ///
 /// \param a
 ///    A 128-bit vector of [4 x float].
@@ -1833,29 +1920,51 @@
 ///    A 128-bit vector of [4 x float].
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use:
-///    00h, 08h, 10h, 18h: Equal
-///    01h, 09h, 11h, 19h: Less than
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-///                        operands)
-///    03h, 0Bh, 13h, 1Bh: Unordered
-///    04h, 0Ch, 14h, 1Ch: Not equal
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                       (swapped operands)
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ss(a, b, c) __extension__ ({ \
+#define _mm_cmp_ss(a, b, c) \
   (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
-                               (__v4sf)(__m128)(b), (c)); })
+                               (__v4sf)(__m128)(b), (c))
 
-/// \brief Takes a [8 x i32] vector and returns the vector element value
+/// Takes an [8 x i32] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-///   EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x i32].
@@ -1864,20 +1973,16 @@
 ///    element is extracted and returned.
 /// \returns A 32-bit integer containing the extracted 32 bits of extended
 ///    packed data.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_extract_epi32(__m256i __a, const int __imm)
-{
-  __v8si __b = (__v8si)__a;
-  return __b[__imm & 7];
-}
+#define _mm256_extract_epi32(X, N) \
+  (int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))
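A small sketch of the macro form's constraint (illustrative; helper name invented): the index now feeds an immediate-operand builtin, so it must be an integer constant expression.

    #include <immintrin.h>

    /* Reads element 3 of the [8 x i32] vector; a non-constant index is
       rejected at compile time. */
    static int third_lane(__m256i v) {
      return _mm256_extract_epi32(v, 3);
    }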
 
-/// \brief Takes a [16 x i16] vector and returns the vector element value
+/// Takes a [16 x i16] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-///    EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector of [16 x i16].
@@ -1886,20 +1991,17 @@
 ///    element is extracted and returned.
 /// \returns A 32-bit integer containing the extracted 16 bits of zero extended
 ///    packed data.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_extract_epi16(__m256i __a, const int __imm)
-{
-  __v16hi __b = (__v16hi)__a;
-  return (unsigned short)__b[__imm & 15];
-}
+#define _mm256_extract_epi16(X, N) \
+  (int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
+                                                    (int)(N))
 
-/// \brief Takes a [32 x i8] vector and returns the vector element value
+/// Takes a [32 x i8] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-///    EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector of [32 x i8].
@@ -1908,21 +2010,18 @@
 ///    element is extracted and returned.
 /// \returns A 32-bit integer containing the extracted 8 bits of zero extended
 ///    packed data.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_extract_epi8(__m256i __a, const int __imm)
-{
-  __v32qi __b = (__v32qi)__a;
-  return (unsigned char)__b[__imm & 31];
-}
+#define _mm256_extract_epi8(X, N) \
+  (int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
+                                                   (int)(N))
 
 #ifdef __x86_64__
-/// \brief Takes a [4 x i64] vector and returns the vector element value
+/// Takes a [4 x i64] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-///    EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector of [4 x i64].
@@ -1931,22 +2030,18 @@
 ///    element is extracted and returned.
 /// \returns A 64-bit integer containing the extracted 64 bits of extended
 ///    packed data.
-static __inline long long  __DEFAULT_FN_ATTRS
-_mm256_extract_epi64(__m256i __a, const int __imm)
-{
-  __v4di __b = (__v4di)__a;
-  return __b[__imm & 3];
-}
+#define _mm256_extract_epi64(X, N) \
+  (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
 #endif
 
-/// \brief Takes a [8 x i32] vector and replaces the vector element value
+/// Takes an [8 x i32] vector and replaces the vector element value
 ///    indexed by the immediate constant operand by a new value. Returns the
 ///    modified vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-///    INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
 ///
 /// \param __a
 ///    A vector of [8 x i32] to be used by the insert operation.
@@ -1955,25 +2050,21 @@
 /// \param __imm
 ///    An immediate integer specifying the index of the vector element to be
 ///    replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-///     with __b.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
-{
-  __v8si __c = (__v8si)__a;
-  __c[__imm & 7] = __b;
-  return (__m256i)__c;
-}
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+#define _mm256_insert_epi32(X, I, N) \
+  (__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
+                                       (int)(I), (int)(N))
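A brief usage sketch (illustrative; helper name invented): the macro yields a modified copy and leaves the input vector untouched.

    #include <immintrin.h>

    /* Produces a copy of v whose element 5 is replaced by value. As with
       the extract forms above, the index must be a compile-time constant. */
    static __m256i with_fifth(__m256i v, int value) {
      return _mm256_insert_epi32(v, value, 5);
    }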
 
 
-/// \brief Takes a [16 x i16] vector and replaces the vector element value
+/// Takes a [16 x i16] vector and replaces the vector element value
 ///    indexed by the immediate constant operand with a new value. Returns the
 ///    modified vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-///    INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
 ///
 /// \param __a
 ///    A vector of [16 x i16] to be used by the insert operation.
@@ -1982,24 +2073,20 @@
 /// \param __imm
 ///    An immediate integer specifying the index of the vector element to be
 ///    replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-///     with __b.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
-{
-  __v16hi __c = (__v16hi)__a;
-  __c[__imm & 15] = __b;
-  return (__m256i)__c;
-}
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+#define _mm256_insert_epi16(X, I, N) \
+  (__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
+                                        (int)(I), (int)(N))
 
-/// \brief Takes a [32 x i8] vector and replaces the vector element value
+/// Takes a [32 x i8] vector and replaces the vector element value
 ///    indexed by the immediate constant operand with a new value. Returns the
 ///    modified vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-///    INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
 ///
 /// \param __a
 ///    A vector of [32 x i8] to be used by the insert operation.
@@ -2008,25 +2095,21 @@
 /// \param __imm
 ///    An immediate integer specifying the index of the vector element to be
 ///    replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-///    with __b.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
-{
-  __v32qi __c = (__v32qi)__a;
-  __c[__imm & 31] = __b;
-  return (__m256i)__c;
-}
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+#define _mm256_insert_epi8(X, I, N) \
+  (__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
+                                        (int)(I), (int)(N))
 
 #ifdef __x86_64__
-/// \brief Takes a [4 x i64] vector and replaces the vector element value
+/// Takes a [4 x i64] vector and replaces the vector element value
 ///    indexed by the immediate constant operand with a new value. Returns the
 ///    modified vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-///    INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
 ///
 /// \param __a
 ///    A vector of [4 x i64] to be used by the insert operation.
@@ -2035,23 +2118,19 @@
 /// \param __imm
 ///    An immediate integer specifying the index of the vector element to be
 ///    replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-///     with __b.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
-{
-  __v4di __c = (__v4di)__a;
-  __c[__imm & 3] = __b;
-  return (__m256i)__c;
-}
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+#define _mm256_insert_epi64(X, I, N) \
+  (__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
+                                       (long long)(I), (int)(N))
 #endif
 
 /* Conversion */
-/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
+/// Converts a vector of [4 x i32] into a vector of [4 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
+/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector of [4 x i32].
@@ -2062,11 +2141,11 @@
   return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
 }
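A minimal sketch (illustrative; helper name invented): this is a widening conversion, and every 32-bit integer is exactly representable as a double, so no rounding can occur.

    #include <immintrin.h>

    static __m256d widen_to_doubles(__m128i four_ints) {
      /* {1, 2, 3, 4} becomes {1.0, 2.0, 3.0, 4.0}. */
      return _mm256_cvtepi32_pd(four_ints);
    }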
 
-/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
+/// Converts a vector of [8 x i32] into a vector of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector.
@@ -2074,15 +2153,15 @@
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_ps(__m256i __a)
 {
-  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
+  return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
 }
 
-/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
 ///    [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x double].
@@ -2093,11 +2172,11 @@
   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
 }
 
-/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
+/// Converts a vector of [8 x float] into a vector of [8 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x float].
@@ -2108,36 +2187,98 @@
   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
 }
 
+/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of
+///    [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_cvtps_pd(__m128 __a)
 {
   return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
 }
 
+/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+///    [4 x i32], truncating the result by rounding towards zero when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
 static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_cvttpd_epi32(__m256d __a)
 {
-  return (__m128i)__builtin_convertvector((__v4df) __a, __v4si);
+  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
 }
 
+/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+///    [4 x i32]. When a conversion is inexact, the value returned is
+///    rounded according to the rounding control bits in the MXCSR register.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
 static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_cvtpd_epi32(__m256d __a)
 {
   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
 }
 
+/// Converts a vector of [8 x float] into a vector of [8 x i32],
+///    truncating the result by rounding towards zero when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_cvttps_epi32(__m256 __a)
 {
-  return (__m256i)__builtin_convertvector((__v8sf) __a, __v8si);
+  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
 }
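The truncating and non-truncating conversions differ only in rounding; a sketch for contrast (illustrative; helper names invented): for an input element of 2.7f the first returns 2, while the second returns 3 under the default round-to-nearest MXCSR setting.

    #include <immintrin.h>

    static __m256i toward_zero(__m256 v) { return _mm256_cvttps_epi32(v); }
    static __m256i per_mxcsr(__m256 v)   { return _mm256_cvtps_epi32(v); }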
 
+/// Returns the first element of the input vector of [4 x double].
+///
+/// \headerfile <avxintrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 64-bit double containing the first element of the input vector.
 static __inline double __DEFAULT_FN_ATTRS
 _mm256_cvtsd_f64(__m256d __a)
 {
  return __a[0];
 }
 
+/// Returns the first element of the input vector of [8 x i32].
+///
+/// \headerfile <avxintrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x i32].
+/// \returns A 32-bit integer containing the first element of the input vector.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_cvtsi256_si32(__m256i __a)
 {
@@ -2145,6 +2286,16 @@
  return __b[0];
 }
 
+/// Returns the first element of the input vector of [8 x float].
+///
+/// \headerfile <avxintrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 32-bit float containing the first element of the input vector.
 static __inline float __DEFAULT_FN_ATTRS
 _mm256_cvtss_f32(__m256 __a)
 {
@@ -2152,18 +2303,72 @@
 }
 
 /* Vector replicate */
+/// Moves and duplicates odd-indexed values from a 256-bit vector of
+///    [8 x float] to float values in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
+///    the return value. \n
+///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
+///    the return value. \n
+///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
+///    return value. \n
+///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
+///    return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+///    values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_movehdup_ps(__m256 __a)
 {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
 }
 
+/// Moves and duplicates even-indexed values from a 256-bit vector of
+///    [8 x float] to float values in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
+///    the return value. \n
+///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
+///    the return value. \n
+///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
+///    return value. \n
+///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
+///    return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+///    values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_moveldup_ps(__m256 __a)
 {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
 }
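
/* Illustrative sketch, not part of this patch: the odd/even duplication
 * described above, shown on a concrete vector. "dup_demo" is a
 * hypothetical helper; assumes AVX. */
#include <immintrin.h>

static void dup_demo(void) {
  __m256 v  = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0); /* element i == i */
  __m256 hi = _mm256_movehdup_ps(v); /* 1, 1, 3, 3, 5, 5, 7, 7 */
  __m256 lo = _mm256_moveldup_ps(v); /* 0, 0, 2, 2, 4, 4, 6, 6 */
  (void)hi; (void)lo;
}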
 
+/// Moves and duplicates double-precision floating point values from a
+///    256-bit vector of [4 x double] to double-precision values in a 256-bit
+///    vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double]. \n
+///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
+///    return value. \n
+///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
+///    the return value.
+/// \returns A 256-bit vector of [4 x double] containing the moved and
+///    duplicated values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_movedup_pd(__m256d __a)
 {
@@ -2171,24 +2376,98 @@
 }
 
 /* Unpack and Interleave */
+/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
+///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [127:64] are written to bits [63:0] of the return value. \n
+///    Bits [255:192] are written to bits [191:128] of the return value. \n
+/// \param __b
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [127:64] are written to bits [127:64] of the return value. \n
+///    Bits [255:192] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_unpackhi_pd(__m256d __a, __m256d __b)
 {
   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
 }
 
+/// Unpacks the even-indexed vector elements from two 256-bit vectors of
+///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [63:0] are written to bits [63:0] of the return value. \n
+///    Bits [191:128] are written to bits [191:128] of the return value.
+/// \param __b
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [63:0] are written to bits [127:64] of the return value. \n
+///    Bits [191:128] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_unpacklo_pd(__m256d __a, __m256d __b)
 {
   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
 }
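
/* Illustrative sketch, not part of this patch: within each 128-bit lane,
 * unpackhi/unpacklo interleave the odd/even elements of the two sources.
 * "unpack_demo" is a hypothetical helper; assumes AVX. */
#include <immintrin.h>

static void unpack_demo(void) {
  __m256d a  = _mm256_set_pd(3.0, 2.0, 1.0, 0.0); /* a = {0, 1, 2, 3} */
  __m256d b  = _mm256_set_pd(7.0, 6.0, 5.0, 4.0); /* b = {4, 5, 6, 7} */
  __m256d hi = _mm256_unpackhi_pd(a, b);          /* {1, 5, 3, 7} */
  __m256d lo = _mm256_unpacklo_pd(a, b);          /* {0, 4, 2, 6} */
  (void)hi; (void)lo;
}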
 
+/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
+///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [95:64] are written to bits [31:0] of the return value. \n
+///    Bits [127:96] are written to bits [95:64] of the return value. \n
+///    Bits [223:192] are written to bits [159:128] of the return value. \n
+///    Bits [255:224] are written to bits [223:192] of the return value.
+/// \param __b
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [95:64] are written to bits [63:32] of the return value. \n
+///    Bits [127:96] are written to bits [127:96] of the return value. \n
+///    Bits [223:192] are written to bits [191:160] of the return value. \n
+///    Bits [255:224] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_unpackhi_ps(__m256 __a, __m256 __b)
 {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
 }
 
+/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
+///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [31:0] are written to bits [31:0] of the return value. \n
+///    Bits [63:32] are written to bits [95:64] of the return value. \n
+///    Bits [159:128] are written to bits [159:128] of the return value. \n
+///    Bits [191:160] are written to bits [223:192] of the return value.
+/// \param __b
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [31:0] are written to bits [63:32] of the return value. \n
+///    Bits [63:32] are written to bits [127:96] of the return value. \n
+///    Bits [159:128] are written to bits [191:160] of the return value. \n
+///    Bits [191:160] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_unpacklo_ps(__m256 __a, __m256 __b)
 {
@@ -2196,90 +2475,431 @@
 }
 
 /* Bit Test */
-static __inline int __DEFAULT_FN_ATTRS
+/// Given two 128-bit floating-point vectors of [2 x double], perform an
+///    element-by-element comparison of the double-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns the ZF flag in the EFLAGS register.
+static __inline int __DEFAULT_FN_ATTRS128
 _mm_testz_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
 }
 
-static __inline int __DEFAULT_FN_ATTRS
+/// Given two 128-bit floating-point vectors of [2 x double], perform an
+///    element-by-element comparison of the double-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns the CF flag in the EFLAGS register.
+static __inline int __DEFAULT_FN_ATTRS128
 _mm_testc_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
 }
 
-static __inline int __DEFAULT_FN_ATTRS
+/// Given two 128-bit floating-point vectors of [2 x double], perform an
+///    element-by-element comparison of the double-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS128
 _mm_testnzc_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
 }
 
-static __inline int __DEFAULT_FN_ATTRS
+/// Given two 128-bit floating-point vectors of [4 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS128
 _mm_testz_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
 }
 
-static __inline int __DEFAULT_FN_ATTRS
+/// Given two 128-bit floating-point vectors of [4 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS128
 _mm_testc_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
 }
 
-static __inline int __DEFAULT_FN_ATTRS
+/// Given two 128-bit floating-point vectors of [4 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS128
 _mm_testnzc_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
 }
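
/* Illustrative sketch, not part of this patch: the three VTESTPS
 * predicates on a concrete pair of vectors. testz asks "no element has
 * both sign bits set"; testc asks "every sign bit in __b is also set in
 * __a". "vtest_demo" is a hypothetical helper; assumes AVX. */
#include <immintrin.h>
#include <assert.h>

static void vtest_demo(void) {
  __m128 a = _mm_set_ps(-1.0f,  1.0f, 1.0f, 1.0f); /* sign bit in element 3 */
  __m128 b = _mm_set_ps( 1.0f, -1.0f, 1.0f, 1.0f); /* sign bit in element 2 */
  assert(_mm_testz_ps(a, b) == 1);   /* no position has both signs set: ZF=1 */
  assert(_mm_testc_ps(a, b) == 0);   /* b has a sign bit a lacks: CF=0 */
  assert(_mm_testnzc_ps(a, b) == 0); /* requires ZF==0 and CF==0; ZF==1 here */
}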
 
+/// Given two 256-bit floating-point vectors of [4 x double], perform an
+///    element-by-element comparison of the double-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \returns the ZF flag.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testz_pd(__m256d __a, __m256d __b)
 {
   return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// Given two 256-bit floating-point vectors of [4 x double], perform an
+///    element-by-element comparison of the double-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \returns the CF flag.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testc_pd(__m256d __a, __m256d __b)
 {
   return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// Given two 256-bit floating-point vectors of [4 x double], perform an
+///    element-by-element comparison of the double-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testnzc_pd(__m256d __a, __m256d __b)
 {
   return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// Given two 256-bit floating-point vectors of [8 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \returns the ZF flag.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testz_ps(__m256 __a, __m256 __b)
 {
   return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// Given two 256-bit floating-point vectors of [8 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \returns the CF flag.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testc_ps(__m256 __a, __m256 __b)
 {
   return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// Given two 256-bit floating-point vectors of [8 x float], perform an
+///    element-by-element comparison of the single-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testnzc_ps(__m256 __a, __m256 __b)
 {
   return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
+///    of the two source vectors.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of bits where both bits are 1, the ZF flag
+///    is set to 0. Otherwise the ZF flag is set to 1. \n
+///    If there is at least one pair of bits where the bit from the first source
+///    vector is 0 and the bit from the second source vector is 1, the CF flag
+///    is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __b
+///    A 256-bit integer vector.
+/// \returns the ZF flag.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testz_si256(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
 }
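
/* Illustrative sketch, not part of this patch: a common VPTEST idiom is
 * testing a 256-bit vector against itself to ask whether it is all zeros.
 * "is_all_zero" is a hypothetical helper; assumes AVX. */
#include <immintrin.h>

static int is_all_zero(__m256i v) {
  return _mm256_testz_si256(v, v); /* ZF==1 (returns 1) iff no bit of v is set */
}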
 
+/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
+///    of the two source vectors.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of bits where both bits are 1, the ZF flag
+///    is set to 0. Otherwise the ZF flag is set to 1. \n
+///    If there is at least one pair of bits where the bit from the first source
+///    vector is 0 and the bit from the second source vector is 1, the CF flag
+///    is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __b
+///    A 256-bit integer vector.
+/// \returns the CF flag.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testc_si256(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
 }
 
+/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
+///    of the two source vectors.
+///
+///    The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of bits where both bits are 1, the ZF flag
+///    is set to 0. Otherwise the ZF flag is set to 1. \n
+///    If there is at least one pair of bits where the bit from the first source
+///    vector is 0 and the bit from the second source vector is 1, the CF flag
+///    is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __b
+///    A 256-bit integer vector.
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testnzc_si256(__m256i __a, __m256i __b)
 {
@@ -2287,12 +2907,36 @@
 }
 
 /* Vector extract sign mask */
+/// Extracts the sign bits of double-precision floating point elements
+///    in a 256-bit vector of [4 x double] and writes them to the lower order
+///    bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the double-precision
+///    floating point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [3:0].
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_movemask_pd(__m256d __a)
 {
   return __builtin_ia32_movmskpd256((__v4df)__a);
 }
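
/* Illustrative sketch, not part of this patch: using the extracted sign
 * mask as a scalar test for negative elements. "any_negative" is a
 * hypothetical helper; assumes AVX. */
#include <immintrin.h>

static int any_negative(__m256d v) {
  return _mm256_movemask_pd(v) != 0; /* bits [3:0] hold the four sign bits */
}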
 
+/// Extracts the sign bits of single-precision floating point elements
+///    in a 256-bit vector of [8 x float] and writes them to the lower order
+///    bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the single-precision floating
+///    point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [7:0].
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_movemask_ps(__m256 __a)
 {
@@ -2300,65 +2944,171 @@
 }
 
 /* Vector __zero */
-static __inline void __DEFAULT_FN_ATTRS
+/// Zeroes the contents of all XMM and YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
+static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
 _mm256_zeroall(void)
 {
   __builtin_ia32_vzeroall();
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
+static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
 _mm256_zeroupper(void)
 {
   __builtin_ia32_vzeroupper();
 }
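
/* Illustrative sketch, not part of this patch: compilers normally insert
 * VZEROUPPER automatically, but code that hand-mixes AVX and legacy SSE
 * may call it explicitly to avoid the SSE transition penalty.
 * "avx_work_then_sse" is a hypothetical example; assumes AVX. */
#include <immintrin.h>

void avx_work_then_sse(float *dst, const float *src) {
  __m256 v = _mm256_loadu_ps(src);
  _mm256_storeu_ps(dst, v);
  _mm256_zeroupper(); /* clear upper YMM halves before SSE-only code runs */
}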
 
 /* Vector load with broadcast */
-static __inline __m128 __DEFAULT_FN_ATTRS
+/// Loads a scalar single-precision floating point value from the
+///    address pointed to by \a __a and broadcasts it to the elements of a
+///    [4 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+///    A pointer to the single-precision floating point value to be
+///    broadcast.
+/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
+///    equal to the broadcast value.
+static __inline __m128 __DEFAULT_FN_ATTRS128
 _mm_broadcast_ss(float const *__a)
 {
   float __f = *__a;
-  return (__m128)(__v4sf){ __f, __f, __f, __f };
+  return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
 }
 
+/// Loads a scalar double-precision floating point value from the
+///    address pointed to by \a __a and broadcasts it to the elements of a
+///    [4 x double] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
+///
+/// \param __a
+///    A pointer to the double-precision floating point value to be
+///    broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
+///    equal to the broadcast value.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_broadcast_sd(double const *__a)
 {
   double __d = *__a;
-  return (__m256d)(__v4df){ __d, __d, __d, __d };
+  return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
 }
 
+/// Loads a scalar single-precision floating point value from the
+///    address pointed to by \a __a and broadcasts it to the elements of an
+///    [8 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+///    A pointer to the single-precision floating point value to be
+///    broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
+///    equal to the broadcast value.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_broadcast_ss(float const *__a)
 {
   float __f = *__a;
-  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
+  return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
 }
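
/* Illustrative sketch, not part of this patch: broadcasting a scalar
 * coefficient for a vectorized scale operation. "scale8" is a
 * hypothetical helper; assumes AVX. */
#include <immintrin.h>

static __m256 scale8(const float *coef, __m256 x) {
  __m256 c = _mm256_broadcast_ss(coef); /* c = {*coef, *coef, ..., *coef} */
  return _mm256_mul_ps(c, x);
}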
 
+/// Loads a 128-bit vector of [2 x double] from the address pointed to by
+///    \a __a and broadcasts it to both 128-bit elements of a 256-bit vector
+///    of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+///    A pointer to the 128-bit vector of [2 x double] to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
+///    equal to the broadcast value.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_broadcast_pd(__m128d const *__a)
 {
-  return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
+  __m128d __b = _mm_loadu_pd((const double *)__a);
+  return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
+                                          0, 1, 0, 1);
 }
 
+/// Loads a 128-bit vector of [4 x float] from the address pointed to by
+///    \a __a and broadcasts it to both 128-bit elements of a 256-bit vector
+///    of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+///    A pointer to the 128-bit vector of [4 x float] to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
+///    equal to the broadcast value.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_broadcast_ps(__m128 const *__a)
 {
-  return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
+  __m128 __b = _mm_loadu_ps((const float *)__a);
+  return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
+                                         0, 1, 2, 3, 0, 1, 2, 3);
 }
 
 /* SIMD load ops */
+/// Loads 4 double-precision floating point values from a 32-byte aligned
+///    memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location containing
+///    double-precision floating point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_load_pd(double const *__p)
 {
   return *(__m256d *)__p;
 }
 
+/// Loads 8 single-precision floating point values from a 32-byte aligned
+///    memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location containing float values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_load_ps(float const *__p)
 {
   return *(__m256 *)__p;
 }
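
/* Illustrative sketch, not part of this patch: _mm256_load_ps requires a
 * 32-byte aligned pointer, while _mm256_loadu_ps accepts any alignment.
 * "load_demo" is a hypothetical helper; assumes C11 alignas and AVX. */
#include <immintrin.h>
#include <stdalign.h>

static __m256 load_demo(const float *unaligned) {
  alignas(32) float buf[8] = {0};
  __m256 a = _mm256_load_ps(buf);        /* OK: buf is 32-byte aligned */
  __m256 u = _mm256_loadu_ps(unaligned); /* OK for any alignment */
  return _mm256_add_ps(a, u);
}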
 
+/// Loads 4 double-precision floating point values from an unaligned
+///    memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing double-precision floating
+///    point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_loadu_pd(double const *__p)
 {
@@ -2368,6 +3118,17 @@
   return ((struct __loadu_pd*)__p)->__v;
 }
 
+/// Loads 8 single-precision floating point values from an unaligned
+///    memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing single-precision floating
+///    point values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_loadu_ps(float const *__p)
 {
@@ -2377,12 +3138,33 @@
   return ((struct __loadu_ps*)__p)->__v;
 }
 
+/// Loads 256 bits of integer data from a 32-byte aligned memory
+///    location pointed to by \a __p into elements of a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
+///    values.
+/// \returns A 256-bit integer vector containing the moved values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_load_si256(__m256i const *__p)
 {
   return *__p;
 }
 
+/// Loads 256 bits of integer data from an unaligned memory location
+///    pointed to by \a __p into a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_loadu_si256(__m256i const *__p)
 {
@@ -2392,6 +3174,18 @@
   return ((struct __loadu_si256*)__p)->__v;
 }
 
+/// Loads 256 bits of integer data from an unaligned memory location
+///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
+///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
+///    line boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_lddqu_si256(__m256i const *__p)
 {
@@ -2399,18 +3193,55 @@
 }
 
 /* SIMD store ops */
+/// Stores double-precision floating point values from a 256-bit vector
+///    of [4 x double] to a 32-byte aligned memory location pointed to by
+///    \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will receive the
+///    double-precision floating point values.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_store_pd(double *__p, __m256d __a)
 {
   *(__m256d *)__p = __a;
 }
 
+/// Stores single-precision floating point values from a 256-bit vector
+///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will receive the
+///    float values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_store_ps(float *__p, __m256 __a)
 {
   *(__m256 *)__p = __a;
 }
 
+/// Stores double-precision floating point values from a 256-bit vector
+///    of [4 x double] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the double-precision
+///    floating point values.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_pd(double *__p, __m256d __a)
 {
@@ -2420,6 +3251,17 @@
   ((struct __storeu_pd*)__p)->__v = __a;
 }
 
+/// Stores single-precision floating point values from a 256-bit vector
+///    of [8 x float] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_ps(float *__p, __m256 __a)
 {
@@ -2429,12 +3271,35 @@
   ((struct __storeu_ps*)__p)->__v = __a;
 }
 
+/// Stores integer values from a 256-bit integer vector to a 32-byte
+///    aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will receive the
+///    integer values.
+/// \param __a
+///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_store_si256(__m256i *__p, __m256i __a)
 {
   *__p = __a;
 }
 
+/// Stores integer values from a 256-bit integer vector to an unaligned
+///    memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer values.
+/// \param __a
+///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_si256(__m256i *__p, __m256i __a)
 {
@@ -2445,12 +3310,48 @@
 }
 
 /* Conditional load ops */
-static __inline __m128d __DEFAULT_FN_ATTRS
+/// Conditionally loads double-precision floating point elements from a
+///    memory location pointed to by \a __p into a 128-bit vector of
+///    [2 x double], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the double-precision
+///    floating point values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each data element represents the mask bits. If a mask bit is zero, the
+///    corresponding value in the memory location is not loaded and the
+///    corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline __m128d __DEFAULT_FN_ATTRS128
 _mm_maskload_pd(double const *__p, __m128i __m)
 {
   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
 }
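
/* Illustrative sketch, not part of this patch: a maskload that reads only
 * the first element. The mask is taken from the most significant (sign)
 * bit of each 64-bit lane. "load_first_only" is a hypothetical helper;
 * assumes AVX. */
#include <immintrin.h>

static __m128d load_first_only(const double *p) {
  __m128i m = _mm_set_epi64x(0, -1LL); /* lane 0 selected, lane 1 zeroed */
  return _mm_maskload_pd(p, m);        /* result = {p[0], 0.0} */
}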
 
+/// Conditionally loads double-precision floating point elements from a
+///    memory location pointed to by \a __p into a 256-bit vector of
+///    [4 x double], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the double-precision
+///    floating point values.
+/// \param __m
+///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
+///    significant bit of each quadword element represents the mask bits. If a
+///    mask bit is zero, the corresponding value in the memory location is not
+///    loaded and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [4 x double] containing the loaded values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_maskload_pd(double const *__p, __m256i __m)
 {
@@ -2458,12 +3359,48 @@
                                                (__v4di)__m);
 }
 
-static __inline __m128 __DEFAULT_FN_ATTRS
+/// Conditionally loads single-precision floating point elements from a
+///    memory location pointed to by \a __p into a 128-bit vector of
+///    [4 x float], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the single-precision
+///    floating point values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each data element represents the mask bits. If a mask bit is zero, the
+///    corresponding value in the memory location is not loaded and the
+///    corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
+static __inline __m128 __DEFAULT_FN_ATTRS128
 _mm_maskload_ps(float const *__p, __m128i __m)
 {
   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
 }
 
+/// Conditionally loads single-precision floating point elements from a
+///    memory location pointed to by \a __p into a 256-bit vector of
+///    [8 x float], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the single-precision
+///    floating point values.
+/// \param __m
+///    A 256-bit integer vector of [8 x dword] containing the mask. The most
+///    significant bit of each dword element represents the mask bits. If a mask
+///    bit is zero, the corresponding value in the memory location is not loaded
+///    and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [8 x float] containing the loaded values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_maskload_ps(float const *__p, __m256i __m)
 {
@@ -2471,98 +3408,427 @@
 }
 
 /* Conditional store ops */
+/// Moves single-precision floating point values from a 256-bit vector
+///    of [8 x float] to a memory location pointed to by \a __p, according to
+///    the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 256-bit integer vector of [8 x dword] containing the mask. The most
+///    significant bit of each dword element in the mask vector represents the
+///    mask bits. If a mask bit is zero, the corresponding value from vector
+///    \a __a is not stored and the corresponding field in the memory location
+///    pointed to by \a __p is not changed.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be stored.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
 {
   __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+/// Moves double-precision values from a 128-bit vector of [2 x double]
+///    to a memory location pointed to by \a __p, according to the specified
+///    mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the double-precision
+///    floating point values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each field in the mask vector represents the mask bits. If a mask bit is
+///    zero, the corresponding value from vector \a __a is not stored and the
+///    corresponding field in the memory location pointed to by \a __p is not
+///    changed.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS128
 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
 {
   __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
 }
 
+/// Moves double-precision values from a 256-bit vector of [4 x double]
+///    to a memory location pointed to by \a __p, according to the specified
+///    mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the double-precision
+///    floating point values.
+/// \param __m
+///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
+///    significant bit of each quadword element in the mask vector represents
+///    the mask bits. If a mask bit is zero, the corresponding value from vector
+///    \a __a is not stored and the corresponding field in the memory location
+///    pointed to by \a __p is not changed.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be stored.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
 {
   __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
 }
 
-static __inline void __DEFAULT_FN_ATTRS
+/// Moves single-precision floating point values from a 128-bit vector
+///    of [4 x float] to a memory location pointed to by \a __p, according to
+///    the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each field in the mask vector represents the mask bits. If a mask bit is
+///    zero, the corresponding value from vector \a __a is not stored and the
+///    corresponding field in the memory location pointed to by \a __p is not
+///    changed.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS128
 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 {
   __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
 }
 
 /* Cacheability support ops */
+/// Moves integer data from a 256-bit integer vector to a 32-byte
+///    aligned memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
+///
+/// \param __a
+///    A pointer to a 32-byte aligned memory location that will receive the
+///    integer values.
+/// \param __b
+///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_si256(__m256i *__a, __m256i __b)
 {
-  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
+  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
+  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
 }
 
+/// Moves double-precision values from a 256-bit vector of [4 x double]
+///    to a 32-byte aligned memory location. To minimize caching, the data is
+///    flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
+///
+/// \param __a
+///    A pointer to a 32-byte aligned memory location that will receive the
+///    double-precision floating-point values.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_pd(double *__a, __m256d __b)
 {
-  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
+  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
+  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
 }
 
+/// Moves single-precision floating point values from a 256-bit vector
+///    of [8 x float] to a 32-byte aligned memory location. To minimize
+///    caching, the data is flagged as non-temporal (unlikely to be used again
+///    soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-byte aligned memory location that will receive the
+///    single-precision floating point values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_ps(float *__p, __m256 __a)
 {
-  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
+  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
+  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
 }
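
/* Illustrative sketch, not part of this patch: streaming stores bypass
 * the cache, so pair them with a store fence before the data is consumed
 * by another thread. "fill_stream" is a hypothetical helper; assumes AVX,
 * a 32-byte aligned destination, and n a multiple of 8. */
#include <immintrin.h>

static void fill_stream(float *dst32, __m256 v, int n) {
  for (int i = 0; i < n; i += 8)
    _mm256_stream_ps(dst32 + i, v); /* non-temporal store of 8 floats */
  _mm_sfence(); /* make the non-temporal stores globally visible */
}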
 
 /* Create vectors */
+/// Create a 256-bit vector of [4 x double] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [4 x double] containing undefined values.
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_undefined_pd(void)
 {
   return (__m256d)__builtin_ia32_undef256();
 }
 
+/// Create a 256-bit vector of [8 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [8 x float] containing undefined values.
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_undefined_ps(void)
 {
   return (__m256)__builtin_ia32_undef256();
 }
 
+/// Create a 256-bit integer vector with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit integer vector containing undefined values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_undefined_si256(void)
 {
   return (__m256i)__builtin_ia32_undef256();
 }
 
+/// Constructs a 256-bit floating-point vector of [4 x double]
+///    initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A double-precision floating-point value used to initialize bits [255:192]
+///    of the result.
+/// \param __b
+///    A double-precision floating-point value used to initialize bits [191:128]
+///    of the result.
+/// \param __c
+///    A double-precision floating-point value used to initialize bits [127:64]
+///    of the result.
+/// \param __d
+///    A double-precision floating-point value used to initialize bits [63:0]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_set_pd(double __a, double __b, double __c, double __d)
 {
-  return (__m256d){ __d, __c, __b, __a };
+  return __extension__ (__m256d){ __d, __c, __b, __a };
 }
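
/* Illustrative sketch, not part of this patch: the set intrinsics take
 * their arguments from the highest element down, so the first argument
 * lands in bits [255:192] and the last in bits [63:0]. "set_order_demo"
 * is a hypothetical helper; assumes AVX. */
#include <immintrin.h>

static void set_order_demo(void) {
  __m256d v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
  double e0 = _mm256_cvtsd_f64(v); /* e0 == 0.0, the last argument */
  (void)e0;
}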
 
+/// Constructs a 256-bit floating-point vector of [8 x float] initialized
+///    with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __a
+///    A single-precision floating-point value used to initialize bits [255:224]
+///    of the result.
+/// \param __b
+///    A single-precision floating-point value used to initialize bits [223:192]
+///    of the result.
+/// \param __c
+///    A single-precision floating-point value used to initialize bits [191:160]
+///    of the result.
+/// \param __d
+///    A single-precision floating-point value used to initialize bits [159:128]
+///    of the result.
+/// \param __e
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \param __f
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __g
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __h
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_set_ps(float __a, float __b, float __c, float __d,
               float __e, float __f, float __g, float __h)
 {
-  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
+  return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
 }
 
+/// Constructs a 256-bit integer vector initialized with the specified
+///    32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i4
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i5
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i6
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i7
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \returns An initialized 256-bit integer vector.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
                  int __i4, int __i5, int __i6, int __i7)
 {
-  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
+  return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
 }
 
+/// Constructs a 256-bit integer vector initialized with the specified
+///    16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __w15
+///    A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \param __w14
+///    A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w13
+///    A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w12
+///    A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w11
+///    A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w10
+///    A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w09
+///    A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w08
+///    A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w07
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w06
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w05
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w04
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w03
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w02
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w01
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w00
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \returns An initialized 256-bit integer vector.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
                  short __w11, short __w10, short __w09, short __w08,
                  short __w07, short __w06, short __w05, short __w04,
                  short __w03, short __w02, short __w01, short __w00)
 {
-  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
+  return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
     __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
 }
 
+/// Constructs a 256-bit integer vector initialized with the specified
+///    8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __b31
+///    An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \param __b30
+///    An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b29
+///    An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b28
+///    An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b27
+///    An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b26
+///    An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b25
+///    An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b24
+///    An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b23
+///    An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b22
+///    An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b21
+///    An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b20
+///    An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b19
+///    An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b18
+///    An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b17
+///    An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b16
+///    An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b09
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b08
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b07
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b06
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b05
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b04
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b03
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b02
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b01
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b00
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \returns An initialized 256-bit integer vector.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
                 char __b27, char __b26, char __b25, char __b24,
@@ -2573,7 +3839,7 @@
                 char __b07, char __b06, char __b05, char __b04,
                 char __b03, char __b02, char __b01, char __b00)
 {
-  return (__m256i)(__v32qi){
+  return __extension__ (__m256i)(__v32qi){
     __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
     __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
     __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
@@ -2581,43 +3847,257 @@
   };
 }
 
+/// Constructs a 256-bit integer vector initialized with the specified
+///    64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \param __b
+///    A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __c
+///    A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __d
+///    A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \returns An initialized 256-bit integer vector.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
 {
-  return (__m256i)(__v4di){ __d, __c, __b, __a };
+  return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
 }
 
 /* Create vectors with elements in reverse order */
+/// Constructs a 256-bit floating-point vector of [4 x double],
+///    initialized in reverse order with the specified double-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A double-precision floating-point value used to initialize bits [63:0]
+///    of the result.
+/// \param __b
+///    A double-precision floating-point value used to initialize bits [127:64]
+///    of the result.
+/// \param __c
+///    A double-precision floating-point value used to initialize bits [191:128]
+///    of the result.
+/// \param __d
+///    A double-precision floating-point value used to initialize bits [255:192]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_setr_pd(double __a, double __b, double __c, double __d)
 {
-  return (__m256d){ __a, __b, __c, __d };
+  return _mm256_set_pd(__d, __c, __b, __a);
 }
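+
+/* Usage sketch (illustrative): the setr forms list elements in memory
+   order, lowest first, so
+
+     __m256d v = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+
+   is equivalent to _mm256_set_pd(4.0, 3.0, 2.0, 1.0). */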
 
+/// Constructs a 256-bit floating-point vector of [8 x float],
+///    initialized in reverse order with the specified single-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __a
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \param __b
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __c
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __d
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \param __e
+///    A single-precision floating-point value used to initialize bits [159:128]
+///    of the result.
+/// \param __f
+///    A single-precision floating-point value used to initialize bits [191:160]
+///    of the result.
+/// \param __g
+///    A single-precision floating-point value used to initialize bits [223:192]
+///    of the result.
+/// \param __h
+///    A single-precision floating-point value used to initialize bits [255:224]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_setr_ps(float __a, float __b, float __c, float __d,
                float __e, float __f, float __g, float __h)
 {
-  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
+  return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
 }
 
+/// Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i4
+///    A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i5
+///    A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i6
+///    A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i7
+///    A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \returns An initialized 256-bit integer vector.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
                   int __i4, int __i5, int __i6, int __i7)
 {
-  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
+  return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
 }
 
+/// Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __w15
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w14
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w13
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w12
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w11
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w10
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w09
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w08
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w07
+///    A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w06
+///    A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w05
+///    A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w04
+///    A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w03
+///    A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w02
+///    A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w01
+///    A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w00
+///    A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \returns An initialized 256-bit integer vector.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
        short __w11, short __w10, short __w09, short __w08,
        short __w07, short __w06, short __w05, short __w04,
        short __w03, short __w02, short __w01, short __w00)
 {
-  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
-    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
+  return _mm256_set_epi16(__w00, __w01, __w02, __w03,
+                          __w04, __w05, __w06, __w07,
+                          __w08, __w09, __w10, __w11,
+                          __w12, __w13, __w14, __w15);
 }
 
+/// Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __b31
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b30
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b29
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b28
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b27
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b26
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b25
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b24
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b23
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b22
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b21
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b20
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b19
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b18
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b17
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b16
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b09
+///    An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b08
+///    An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b07
+///    An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b06
+///    An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b05
+///    An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b04
+///    An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b03
+///    An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b02
+///    An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b01
+///    An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b00
+///    An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \returns An initialized 256-bit integer vector.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
                  char __b27, char __b26, char __b25, char __b24,
@@ -2628,216 +4108,678 @@
                  char __b07, char __b06, char __b05, char __b04,
                  char __b03, char __b02, char __b01, char __b00)
 {
-  return (__m256i)(__v32qi){
-    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
-    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
-    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
-    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
+  return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+                         __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+                         __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+                         __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
 }
 
+/// Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \param __b
+///    A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __c
+///    A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __d
+///    A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \returns An initialized 256-bit integer vector.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
 {
-  return (__m256i)(__v4di){ __a, __b, __c, __d };
+  return _mm256_set_epi64x(__d, __c, __b, __a);
 }
 
 /* Create vectors with repeated elements */
+/// Constructs a 256-bit floating-point vector of [4 x double], with each
+///    of the four double-precision floating-point vector elements set to the
+///    specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_set1_pd(double __w)
 {
-  return (__m256d){ __w, __w, __w, __w };
+  return _mm256_set_pd(__w, __w, __w, __w);
 }
 
+/// Constructs a 256-bit floating-point vector of [8 x float], with each
+///    of the eight single-precision floating-point vector elements set to the
+///    specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_set1_ps(float __w)
 {
-  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
+  return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
 }
 
+/// Constructs a 256-bit integer vector of [8 x i32], with each of the
+///    32-bit integral vector elements set to the specified 32-bit integral
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __i
+///    A 32-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [8 x i32].
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi32(int __i)
 {
-  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
+  return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
 }
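+
+/* Usage sketch (illustrative): the set1 forms broadcast a single scalar to
+   every element:
+
+     __m256i v = _mm256_set1_epi32(42);
+
+   leaves 42 in all eight 32-bit lanes. */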
 
+/// Constructs a 256-bit integer vector of [16 x i16], with each of the
+///    16-bit integral vector elements set to the specified 16-bit integral
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __w
+///    A 16-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [16 x i16].
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi16(short __w)
 {
-  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w };
+  return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
+                          __w, __w, __w, __w, __w, __w, __w, __w);
 }
 
+/// Constructs a 256-bit integer vector of [32 x i8], with each of the
+///    8-bit integral vector elements set to the specified 8-bit integral value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __b
+///    An 8-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [32 x i8].
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi8(char __b)
 {
-  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
-    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
-    __b, __b, __b, __b, __b, __b, __b };
+  return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
+                         __b, __b, __b, __b, __b, __b, __b, __b,
+                         __b, __b, __b, __b, __b, __b, __b, __b,
+                         __b, __b, __b, __b, __b, __b, __b, __b);
 }
 
+/// Constructs a 256-bit integer vector of [4 x i64], with each of the
+///    64-bit integral vector elements set to the specified 64-bit integral
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __q
+///    A 64-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [4 x i64].
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi64x(long long __q)
 {
-  return (__m256i)(__v4di){ __q, __q, __q, __q };
+  return _mm256_set_epi64x(__q, __q, __q, __q);
 }
 
 /* Create __zeroed vectors */
+/// Constructs a 256-bit floating-point vector of [4 x double] with all
+///    vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_setzero_pd(void)
 {
-  return (__m256d){ 0, 0, 0, 0 };
+  return __extension__ (__m256d){ 0, 0, 0, 0 };
 }
 
+/// Constructs a 256-bit floating-point vector of [8 x float] with all
+///    vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_setzero_ps(void)
 {
-  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+  return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
 }
 
+/// Constructs a 256-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit integer vector initialized to zero.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setzero_si256(void)
 {
-  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
+  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
 }
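+
+/* Usage sketch (illustrative): a zeroed vector is the usual accumulator
+   seed:
+
+     __m256i acc = _mm256_setzero_si256();
+
+   The XOR idiom cited above is recognized as a dependency-breaking zeroing
+   idiom by modern x86 cores. */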
 
 /* Cast between vector types */
+/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+///    floating-point vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+///    bitwise pattern as the parameter.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castpd_ps(__m256d __a)
 {
   return (__m256)__a;
 }
 
+/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+///    parameter.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castpd_si256(__m256d __a)
 {
   return (__m256i)__a;
 }
 
+/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+///    floating-point vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+///    bitwise pattern as the parameter.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castps_pd(__m256 __a)
 {
   return (__m256d)__a;
 }
 
+/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+///    parameter.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castps_si256(__m256 __a)
 {
   return (__m256i)__a;
 }
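+
+/* The cast intrinsics reinterpret bits without generating code. Sketch
+   (illustrative):
+
+     __m256i i = _mm256_castps_si256(_mm256_set1_ps(1.0f));
+
+   Each 32-bit lane of i now holds 0x3F800000, the bit pattern of 1.0f. */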
 
+/// Casts a 256-bit integer vector into a 256-bit floating-point vector
+///    of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+///    bitwise pattern as the parameter.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castsi256_ps(__m256i __a)
 {
   return (__m256)__a;
 }
 
+/// Casts a 256-bit integer vector into a 256-bit floating-point vector
+///    of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+///    bitwise pattern as the parameter.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castsi256_pd(__m256i __a)
 {
   return (__m256d)__a;
 }
 
+/// Returns the lower 128 bits of a 256-bit floating-point vector of
+///    [4 x double] as a 128-bit floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the
+///    lower 128 bits of the parameter.
 static __inline __m128d __DEFAULT_FN_ATTRS
 _mm256_castpd256_pd128(__m256d __a)
 {
   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
 }
 
+/// Returns the lower 128 bits of a 256-bit floating-point vector of
+///    [8 x float] as a 128-bit floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the
+///    lower 128 bits of the parameter.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm256_castps256_ps128(__m256 __a)
 {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
 }
 
+/// Truncates a 256-bit integer vector into a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 128-bit integer vector containing the lower 128 bits of the
+///    parameter.
 static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_castsi256_si128(__m256i __a)
 {
   return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
 }
 
+/// Constructs a 256-bit floating-point vector of [4 x double] from a
+///    128-bit floating-point vector of [2 x double].
+///
+///    The lower 128 bits contain the value of the source vector. The contents
+///    of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+///    contain the value of the parameter. The contents of the upper 128 bits
+///    are undefined.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castpd128_pd256(__m128d __a)
 {
   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
 }
 
+/// Constructs a 256-bit floating-point vector of [8 x float] from a
+///    128-bit floating-point vector of [4 x float].
+///
+///    The lower 128 bits contain the value of the source vector. The contents
+///    of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+///    contain the value of the parameter. The contents of the upper 128 bits
+///    are undefined.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castps128_ps256(__m128 __a)
 {
   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
 }
 
+/// Constructs a 256-bit integer vector from a 128-bit integer vector.
+///
+///    The lower 128 bits contain the value of the source vector. The contents
+///    of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+///    the parameter. The contents of the upper 128 bits are undefined.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castsi128_si256(__m128i __a)
 {
   return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
 }
 
+/// Constructs a 256-bit floating-point vector of [4 x double] from a
+///    128-bit floating-point vector of [2 x double]. The lower 128 bits
+///    contain the value of the source vector. The upper 128 bits are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+///    contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_zextpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
+}
+
+/// Constructs a 256-bit floating-point vector of [8 x float] from a
+///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+///    the value of the source vector. The upper 128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+///    contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_zextps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// Constructs a 256-bit integer vector from a 128-bit integer vector.
+///    The lower 128 bits contain the value of the source vector. The upper
+///    128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+///    the parameter. The upper 128 bits are set to zero.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_zextsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
+}
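+
+/* Sketch (illustrative) of the cast/zext distinction:
+
+     __m256i c = _mm256_castsi128_si256(x);
+     __m256i z = _mm256_zextsi128_si256(x);
+
+   leaves bits [255:128] of c undefined but guarantees they are zero in z. */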
+
 /*
    Vector insert.
    We use macros rather than inlines because we only want to accept
    invocations where the immediate M is a constant expression.
 */
-#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
-  (__m256)__builtin_shufflevector( \
-    (__v8sf)(__m256)(V1), \
-    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
-    (((M) & 1) ?  0 :  8), \
-    (((M) & 1) ?  1 :  9), \
-    (((M) & 1) ?  2 : 10), \
-    (((M) & 1) ?  3 : 11), \
-    (((M) & 1) ?  8 :  4), \
-    (((M) & 1) ?  9 :  5), \
-    (((M) & 1) ? 10 :  6), \
-    (((M) & 1) ? 11 :  7) );})
+/// Constructs a new 256-bit vector of [8 x float] by first duplicating
+///    a 256-bit vector of [8 x float] given in the first parameter, and then
+///    replacing either the upper or the lower 128 bits with the contents of a
+///    128-bit vector of [4 x float] in the second parameter.
+///
+///    The immediate integer parameter determines whether the upper or the
+///    lower 128 bits are replaced.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float]. This vector is copied to the result
+///    first, and then either the upper or the lower 128 bits of the result will
+///    be replaced by the contents of \a V2.
+/// \param V2
+///    A 128-bit vector of [4 x float]. The contents of this parameter are
+///    written to either the upper or the lower 128 bits of the result depending
+///    on the value of parameter \a M.
+/// \param M
+///    An immediate integer. The least significant bit determines how the values
+///    from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
+///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
+///    result. \n
+///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
+///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+///    result.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+#define _mm256_insertf128_ps(V1, V2, M) \
+  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
+                                           (__v4sf)(__m128)(V2), (int)(M))
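+
+/* Usage sketch (illustrative; M must be a compile-time constant):
+
+     __m256 v2 = _mm256_insertf128_ps(v1, hi, 1);
+
+   copies hi into bits [255:128] of v2 and bits [127:0] of v1 into v2. */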
 
-#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
-  (__m256d)__builtin_shufflevector( \
-    (__v4df)(__m256d)(V1), \
-    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
-    (((M) & 1) ? 0 : 4), \
-    (((M) & 1) ? 1 : 5), \
-    (((M) & 1) ? 4 : 2), \
-    (((M) & 1) ? 5 : 3) );})
+/// Constructs a new 256-bit vector of [4 x double] by first duplicating
+///    a 256-bit vector of [4 x double] given in the first parameter, and then
+///    replacing either the upper or the lower 128 bits with the contents of a
+///    128-bit vector of [2 x double] in the second parameter.
+///
+///    The immediate integer parameter determines whether the upper or the
+///    lower 128 bits are replaced.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double]. This vector is copied to the result
+///    first, and then either the upper or the lower 128 bits of the result will
+///    be replaced by the contents of \a V2.
+/// \param V2
+///    A 128-bit vector of [2 x double]. The contents of this parameter are
+///    written to either the upper or the lower 128 bits of the result depending
+///    on the value of parameter \a M.
+/// \param M
+///    An immediate integer. The least significant bit determines how the values
+///    from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
+///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
+///    result. \n
+///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
+///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+///    result.
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+#define _mm256_insertf128_pd(V1, V2, M) \
+  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
+                                            (__v2df)(__m128d)(V2), (int)(M))
 
-#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
-  (__m256i)__builtin_shufflevector( \
-    (__v4di)(__m256i)(V1), \
-    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
-    (((M) & 1) ? 0 : 4), \
-    (((M) & 1) ? 1 : 5), \
-    (((M) & 1) ? 4 : 2), \
-    (((M) & 1) ? 5 : 3) );})
+/// Constructs a new 256-bit integer vector by first duplicating a
+///    256-bit integer vector given in the first parameter, and then replacing
+///    either the upper or the lower 128 bits with the contents of a 128-bit
+///    integer vector in the second parameter.
+///
+///    The immediate integer parameter determines whether the upper or the
+///    lower 128 bits are replaced.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit integer vector. This vector is copied to the result first, and
+///    then either the upper or the lower 128 bits of the result will be
+///    replaced by the contents of \a V2.
+/// \param V2
+///    A 128-bit integer vector. The contents of this parameter are written to
+///    either the upper or the lower 128 bits of the result depending on the
+///    value of parameter \a M.
+/// \param M
+///    An immediate integer. The least significant bit determines how the values
+///    from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
+///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
+///    result. \n
+///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
+///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+///    result.
+/// \returns A 256-bit integer vector containing the interleaved values.
+#define _mm256_insertf128_si256(V1, V2, M) \
+  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
+                                            (__v4si)(__m128i)(V2), (int)(M))
 
 /*
    Vector extract.
    We use macros rather than inlines because we only want to accept
    invocations where the immediate M is a constant expression.
 */
-#define _mm256_extractf128_ps(V, M) __extension__ ({ \
-  (__m128)__builtin_shufflevector( \
-    (__v8sf)(__m256)(V), \
-    (__v8sf)(_mm256_undefined_ps()), \
-    (((M) & 1) ? 4 : 0), \
-    (((M) & 1) ? 5 : 1), \
-    (((M) & 1) ? 6 : 2), \
-    (((M) & 1) ? 7 : 3) );})
+/// Extracts either the upper or the lower 128 bits from a 256-bit vector
+///    of [8 x float], as determined by the immediate integer parameter, and
+///    returns the extracted bits as a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An immediate integer. The least significant bit determines which bits are
+///    extracted from the first parameter: \n
+///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+///    result. \n
+///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
+#define _mm256_extractf128_ps(V, M) \
+  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
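+
+/* Usage sketch (illustrative; M must be a compile-time constant):
+
+     __m128 lo = _mm256_extractf128_ps(v, 0);
+     __m128 hi = _mm256_extractf128_ps(v, 1);
+
+   extracts bits [127:0] into lo and bits [255:128] into hi. */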
 
-#define _mm256_extractf128_pd(V, M) __extension__ ({ \
-  (__m128d)__builtin_shufflevector( \
-    (__v4df)(__m256d)(V), \
-    (__v4df)(_mm256_undefined_pd()), \
-    (((M) & 1) ? 2 : 0), \
-    (((M) & 1) ? 3 : 1) );})
+/// Extracts either the upper or the lower 128 bits from a 256-bit vector
+///    of [4 x double], as determined by the immediate integer parameter, and
+///    returns the extracted bits as a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An immediate integer. The least significant bit determines which bits are
+///    extracted from the first parameter: \n
+///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+///    result. \n
+///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
+#define _mm256_extractf128_pd(V, M) \
+  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
 
-#define _mm256_extractf128_si256(V, M) __extension__ ({ \
-  (__m128i)__builtin_shufflevector( \
-    (__v4di)(__m256i)(V), \
-    (__v4di)(_mm256_undefined_si256()), \
-    (((M) & 1) ? 2 : 0), \
-    (((M) & 1) ? 3 : 1) );})
+/// Extracts either the upper or the lower 128 bits from a 256-bit
+///    integer vector, as determined by the immediate integer parameter, and
+///    returns the extracted bits as a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+///    A 256-bit integer vector.
+/// \param M
+///    An immediate integer. The least significant bit determines which bits are
+///    extracted from the first parameter: \n
+///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+///    result. \n
+///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit integer vector containing the extracted bits.
+#define _mm256_extractf128_si256(V, M) \
+  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))
 
 /* SIMD load ops (unaligned) */
+/// Loads two 128-bit floating-point vectors of [4 x float] from
+///    unaligned memory locations and constructs a 256-bit floating-point vector
+///    of [8 x float] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+///   <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location containing 4 consecutive
+///    single-precision floating-point values. These values are to be copied to
+///    bits[255:128] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location containing 4 consecutive
+///    single-precision floating-point values. These values are to be copied to
+///    bits[127:0] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+///    concatenated result.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
 {
@@ -2845,6 +4787,27 @@
   return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
 }
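+
+/* Usage sketch (illustrative): note that the high-half pointer comes first:
+
+     float lo[4] = {0}, hi[4] = {0};
+     __m256 v = _mm256_loadu2_m128(hi, lo);
+
+   loads hi into bits [255:128] of v and lo into bits [127:0]. */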
 
+/// Loads two 128-bit floating-point vectors of [2 x double] from
+///    unaligned memory locations and constructs a 256-bit floating-point vector
+///    of [4 x double] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+///   <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location containing two consecutive
+///    double-precision floating-point values. These values are to be copied to
+///    bits[255:128] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location containing two consecutive
+///    double-precision floating-point values. These values are to be copied to
+///    bits[127:0] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+///    concatenated result.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
 {
@@ -2852,6 +4815,24 @@
   return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
 }
 
+/// Loads two 128-bit integer vectors from unaligned memory locations and
+///    constructs a 256-bit integer vector by concatenating the two 128-bit
+///    vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+///   <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location containing a 128-bit integer
+///    vector. This vector is to be copied to bits[255:128] of the result. The
+///    address of the memory location does not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location containing a 128-bit integer
+///    vector. This vector is to be copied to bits[127:0] of the result. The
+///    address of the memory location does not have to be aligned.
+/// \returns A 256-bit integer vector containing the concatenated result.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
 {
@@ -2860,6 +4841,24 @@
 }
 
 /* SIMD store ops (unaligned) */
+/// Stores the upper and lower 128 bits of a 256-bit floating-point
+///    vector of [8 x float] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+///   store instructions.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
 {
@@ -2871,6 +4870,24 @@
   _mm_storeu_ps(__addr_hi, __v128);
 }
 
+/// Stores the upper and lower 128 bits of a 256-bit floating-point
+///    vector of [4 x double] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+///   store instructions.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
 {
@@ -2882,6 +4899,24 @@
   _mm_storeu_pd(__addr_hi, __v128);
 }
 
+/// Stores the upper and lower 128 bits of a 256-bit integer vector into
+///    two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+///   store instructions.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __a
+///    A 256-bit integer vector.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
 {
@@ -2893,36 +4928,136 @@
   _mm_storeu_si128(__addr_hi, __v128);
 }
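+
+/* Usage sketch (illustrative), mirroring the unaligned loads above:
+
+     float hi[4], lo[4];
+     _mm256_storeu2_m128(hi, lo, v);
+
+   writes bits [255:128] of v to hi and bits [127:0] to lo. */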
 
+/// Constructs a 256-bit floating-point vector of [8 x float] by
+///    concatenating two 128-bit floating-point vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
+///    128 bits of the result.
+/// \param __lo
+///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+///    concatenated result.
 static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_set_m128 (__m128 __hi, __m128 __lo) {
+_mm256_set_m128 (__m128 __hi, __m128 __lo)
+{
   return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
+/// Constructs a 256-bit floating-point vector of [4 x double] by
+///    concatenating two 128-bit floating-point vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
+///    128 bits of the result.
+/// \param __lo
+///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+///    concatenated result.
 static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_set_m128d (__m128d __hi, __m128d __lo) {
-  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+_mm256_set_m128d (__m128d __hi, __m128d __lo)
+{
+  return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
 }
 
+/// Constructs a 256-bit integer vector by concatenating two 128-bit
+///    integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+///    A 128-bit integer vector to be copied to the upper 128 bits of the
+///    result.
+/// \param __lo
+///    A 128-bit integer vector to be copied to the lower 128 bits of the
+///    result.
+/// \returns A 256-bit integer vector containing the concatenated result.
 static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_m128i (__m128i __hi, __m128i __lo) {
-  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+_mm256_set_m128i (__m128i __hi, __m128i __lo)
+{
+  return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
 }
 
+/// Constructs a 256-bit floating-point vector of [8 x float] by
+///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
+///    similar to _mm256_set_m128, but the order of the input parameters is
+///    swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
+///    128 bits of the result.
+/// \param __hi
+///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+///    concatenated result.
 static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
+_mm256_setr_m128 (__m128 __lo, __m128 __hi)
+{
   return _mm256_set_m128(__hi, __lo);
 }
 
+/// Constructs a 256-bit floating-point vector of [4 x double] by
+///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
+///    similar to _mm256_set_m128d, but the order of the input parameters is
+///    swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
+///    128 bits of the result.
+/// \param __hi
+///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+///    concatenated result.
 static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
-  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+_mm256_setr_m128d (__m128d __lo, __m128d __hi)
+{
+  return (__m256d)_mm256_set_m128d(__hi, __lo);
 }
 
+/// Constructs a 256-bit integer vector by concatenating two 128-bit
+///    integer vectors. This is similar to _mm256_set_m128i, but the order of
+///    the input parameters is swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+///    A 128-bit integer vector to be copied to the lower 128 bits of the
+///    result.
+/// \param __hi
+///    A 128-bit integer vector to be copied to the upper 128 bits of the
+///    result.
+/// \returns A 256-bit integer vector containing the concatenated result.
 static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
-  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+_mm256_setr_m128i (__m128i __lo, __m128i __hi)
+{
+  return (__m256i)_mm256_set_m128i(__hi, __lo);
 }
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
 
 #endif /* __AVXINTRIN_H */
diff --git a/darwin-x86/clang-headers/bmiintrin.h b/darwin-x86/clang-headers/bmiintrin.h
index 30acfae..d03bef4 100644
--- a/darwin-x86/clang-headers/bmiintrin.h
+++ b/darwin-x86/clang-headers/bmiintrin.h
@@ -28,107 +28,17 @@
 #ifndef __BMIINTRIN_H
 #define __BMIINTRIN_H
 
-/// \brief Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned short _tzcnt_u16(unsigned short a);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param a
-///    An unsigned 16-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 16-bit integer containing the number of trailing zero
-///    bits in the operand.
 #define _tzcnt_u16(a)     (__tzcnt_u16((a)))
 
-/// \brief Performs a bitwise AND of the second operand with the one's
-///    complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _andn_u32(unsigned int a, unsigned int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ANDN instruction.
-///
-/// \param a
-///    An unsigned integer containing one of the operands.
-/// \param b
-///    An unsigned integer containing one of the operands.
-/// \returns An unsigned integer containing the bitwise AND of the second
-///    operand with the one's complement of the first operand.
 #define _andn_u32(a, b)   (__andn_u32((a), (b)))
 
 /* _bextr_u32 != __bextr_u32 */
-/// \brief Clears all bits in the source except for the least significant bit
-///    containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _blsi_u32(unsigned int a);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSI instruction.
-///
-/// \param a
-///    An unsigned integer whose bits are to be cleared.
-/// \returns An unsigned integer containing the result of clearing the bits from
-///    the source operand.
 #define _blsi_u32(a)      (__blsi_u32((a)))
 
-/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least siginificant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _blsmsk_u32(unsigned int a);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
-///
-/// \param a
-///    An unsigned integer used to create the mask.
-/// \returns An unsigned integer containing the newly created mask.
 #define _blsmsk_u32(a)    (__blsmsk_u32((a)))
 
-/// \brief Clears the least siginificant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _blsr_u32(unsigned int a);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSR instruction.
-///
-/// \param a
-///    An unsigned integer containing the operand to be cleared.
-/// \returns An unsigned integer containing the result of clearing the source
-///    operand.
 #define _blsr_u32(a)      (__blsr_u32((a)))
 
-/// \brief Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _tzcnt_u32(unsigned int a);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param a
-///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of trailing zero
-///    bits in the operand.
 #define _tzcnt_u32(a)     (__tzcnt_u32((a)))
 
 /* Define the default attributes for the functions in this file. */
@@ -139,11 +49,11 @@
    to use it as a potentially faster version of BSF. */
 #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
 
-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 16-bit integer whose trailing zeros are to be counted.
@@ -155,12 +65,12 @@
   return __X ? __builtin_ctzs(__X) : 16;
 }
 
-/// \brief Performs a bitwise AND of the second operand with the one's
+/// Performs a bitwise AND of the second operand with the one's
 ///    complement of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
 ///
 /// \param __X
 ///    An unsigned integer containing one of the operands.
@@ -175,12 +85,12 @@
 }
 
 /* AMD-specified, double-leading-underscore version of BEXTR */
-/// \brief Extracts the specified bits from the first operand and returns them
+/// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be extracted.
@@ -190,6 +100,7 @@
 ///    number of bits to be extracted.
 /// \returns An unsigned integer whose least significant bits contain the
 ///    extracted bits.
+/// \see _bextr_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __bextr_u32(unsigned int __X, unsigned int __Y)
 {
@@ -197,12 +108,12 @@
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR */
-/// \brief Extracts the specified bits from the first operand and returns them
+/// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be extracted.
@@ -214,18 +125,19 @@
 ///    Bits [7:0] specify the number of bits.
 /// \returns An unsigned integer whose least significant bits contain the
 ///    extracted bits.
+/// \see __bextr_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
 {
   return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
 
-/// \brief Clears all bits in the source except for the least significant bit
+/// Clears all bits in the source except for the least significant bit
 ///    containing a value of 1 and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be cleared.
@@ -237,13 +149,13 @@
   return __X & -__X;
 }
 
-/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least siginificant bit that is set to 1 in the source
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
 ///
 /// \param __X
 ///    An unsigned integer used to create the mask.
@@ -254,12 +166,12 @@
   return __X ^ (__X - 1);
 }
 
-/// \brief Clears the least siginificant bit that is set to 1 in the source
+/// Clears the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
 ///
 /// \param __X
 ///    An unsigned integer containing the operand to be cleared.
@@ -271,11 +183,11 @@
   return __X & (__X - 1);
 }
 
-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 32-bit integer whose trailing zeros are to be counted.
@@ -287,16 +199,16 @@
   return __X ? __builtin_ctz(__X) : 32;
 }
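The BMI helpers above compile to single instructions but are equivalent to well-known bit tricks; a small self-check sketch (assumes a BMI-capable CPU and -mbmi):

#include <x86intrin.h>
#include <assert.h>

int main(void) {
  unsigned int x = 0xb0;               /* binary 1011 0000 */
  assert(__blsi_u32(x)   == 0x10);     /* x & -x: isolate lowest set bit */
  assert(__blsmsk_u32(x) == 0x1f);     /* x ^ (x - 1): mask through it   */
  assert(__blsr_u32(x)   == 0xa0);     /* x & (x - 1): clear it          */
  assert(__tzcnt_u32(x)  == 4);        /* trailing zero count            */
  return 0;
}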
 
-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An 32-bit integer containing the number of trailing zero
-///    bits in the operand.
+/// \returns A 32-bit integer containing the number of trailing zero bits in
+///    the operand.
 static __inline__ int __RELAXED_FN_ATTRS
 _mm_tzcnt_32(unsigned int __X)
 {
@@ -305,99 +217,23 @@
 
 #ifdef __x86_64__
 
-/// \brief Performs a bitwise AND of the second operand with the one's
-///    complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ANDN instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer containing one of the operands.
-/// \param b
-///    An unsigned 64-bit integer containing one of the operands.
-/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
-///    operand with the one's complement of the first operand.
 #define _andn_u64(a, b)   (__andn_u64((a), (b)))
 
 /* _bextr_u64 != __bextr_u64 */
-/// \brief Clears all bits in the source except for the least significant bit
-///    containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _blsi_u64(unsigned long long a);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSI instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer whose bits are to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-///    bits from the source operand.
 #define _blsi_u64(a)      (__blsi_u64((a)))
 
-/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least siginificant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _blsmsk_u64(unsigned long long a);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer used to create the mask.
-/// \returns A unsigned 64-bit integer containing the newly created mask.
 #define _blsmsk_u64(a)    (__blsmsk_u64((a)))
 
-/// \brief Clears the least siginificant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _blsr_u64(unsigned long long a);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSR instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer containing the operand to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-///    source operand.
 #define _blsr_u64(a)      (__blsr_u64((a)))
 
-/// \brief Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _tzcnt_u64(unsigned long long a);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of trailing zero
-///    bits in the operand.
 #define _tzcnt_u64(a)     (__tzcnt_u64((a)))
 
-/// \brief Performs a bitwise AND of the second operand with the one's
+/// Performs a bitwise AND of the second operand with the one's
 ///    complement of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer containing one of the operands.
@@ -412,12 +248,12 @@
 }
 
 /* AMD-specified, double-leading-underscore version of BEXTR */
-/// \brief Extracts the specified bits from the first operand and returns them
+/// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be extracted.
@@ -427,6 +263,7 @@
 ///    the number of bits to be extracted.
 /// \returns An unsigned 64-bit integer whose least significant bits contain the
 ///    extracted bits.
+/// \see _bextr_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __bextr_u64(unsigned long long __X, unsigned long long __Y)
 {
@@ -434,12 +271,12 @@
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR */
-/// \brief Extracts the specified bits from the first operand and returns them
+/// Extracts the specified bits from the first operand and returns them
 ///     in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be extracted.
@@ -451,18 +288,19 @@
 ///    Bits [7:0] specify the number of bits.
 /// \returns An unsigned 64-bit integer whose least significant bits contain the
 ///    extracted bits.
+/// \see __bextr_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
 {
   return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
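A hedged sketch of the two BEXTR spellings (assumes -mbmi); the double-underscore form packs the start bit and field length into one control word as (start | (length << 8)):

#include <x86intrin.h>
#include <assert.h>

int main(void) {
  unsigned long long x = 0xabcd1234ULL;
  assert(_bextr_u64(x, 8, 12) == 0xd12);          /* bits [19:8]  */
  assert(__bextr_u64(x, 8 | (12 << 8)) == 0xd12); /* same, packed */
  return 0;
}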
 
-/// \brief Clears all bits in the source except for the least significant bit
+/// Clears all bits in the source except for the least significant bit
 ///    containing a value of 1 and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be cleared.
@@ -474,29 +312,29 @@
   return __X & -__X;
 }
 
-/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least siginificant bit that is set to 1 in the source
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer used to create the mask.
-/// \returns A unsigned 64-bit integer containing the newly created mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsmsk_u64(unsigned long long __X)
 {
   return __X ^ (__X - 1);
 }
 
-/// \brief Clears the least siginificant bit that is set to 1 in the source
+/// Clears the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer containing the operand to be cleared.
@@ -508,11 +346,11 @@
   return __X & (__X - 1);
 }
 
-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose trailing zeros are to be counted.
@@ -524,16 +362,16 @@
   return __X ? __builtin_ctzll(__X) : 64;
 }
 
-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An 64-bit integer containing the number of trailing zero
-///    bits in the operand.
+/// \returns A 64-bit integer containing the number of trailing zero bits in
+///    the operand.
 static __inline__ long long __RELAXED_FN_ATTRS
 _mm_tzcnt_64(unsigned long long __X)
 {
diff --git a/darwin-x86/clang-headers/cetintrin.h b/darwin-x86/clang-headers/cetintrin.h
new file mode 100644
index 0000000..120c954
--- /dev/null
+++ b/darwin-x86/clang-headers/cetintrin.h
@@ -0,0 +1,113 @@
+/*===---- cetintrin.h - CET intrinsic --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CETINTRIN_H
+#define __CETINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("shstk")))
+
+static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
+  __builtin_ia32_incsspd(__a);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
+  __builtin_ia32_incsspq(__a);
+}
+#endif /* __x86_64__ */
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
+  __builtin_ia32_incsspq(__a);
+}
+#else /* __x86_64__ */
+static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
+  __builtin_ia32_incsspd((int)__a);
+}
+#endif /* __x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
+  return __builtin_ia32_rdsspd(__a);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
+  return __builtin_ia32_rdsspq(__a);
+}
+#endif /* __x86_64__ */
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) {
+  return __builtin_ia32_rdsspq(0);
+}
+#else /* __x86_64__ */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
+  return __builtin_ia32_rdsspd(0);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
+  __builtin_ia32_saveprevssp();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
+  __builtin_ia32_rstorssp(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
+  __builtin_ia32_wrssd(__a, __p);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
+  __builtin_ia32_wrssq(__a, __p);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
+  __builtin_ia32_wrussd(__a, __p);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
+  __builtin_ia32_wrussq(__a, __p);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
+  __builtin_ia32_setssbsy();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
+  __builtin_ia32_clrssbsy(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __CETINTRIN_H */
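A hedged sketch for the new shadow-stack read intrinsic (assumes -mshstk; when CET shadow stacks are not enabled, RDSSP leaves its operand unchanged, so _get_ssp() simply returns 0):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
#ifdef __x86_64__
  unsigned long long ssp = _get_ssp();
#else
  unsigned int ssp = _get_ssp();
#endif
  printf("shadow stack pointer: %#llx\n", (unsigned long long)ssp);
  return 0;
}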
diff --git a/darwin-x86/clang-headers/cldemoteintrin.h b/darwin-x86/clang-headers/cldemoteintrin.h
new file mode 100644
index 0000000..fa78148
--- /dev/null
+++ b/darwin-x86/clang-headers/cldemoteintrin.h
@@ -0,0 +1,42 @@
+/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __CLDEMOTEINTRIN_H
+#define __CLDEMOTEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("cldemote")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_cldemote(const void * __P) {
+  __builtin_ia32_cldemote(__P);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
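A hedged sketch of the producer/consumer pattern CLDEMOTE is intended for (assumes -mcldemote; the opcode sits in NOP space, so it should be harmless on CPUs without the feature):

#include <x86intrin.h>

/* Hypothetical helper: write a value, then hint that the line should be
 * demoted toward a cache level closer to the consuming cores. */
void publish(int *slot, int value) {
  *slot = value;
  _cldemote(slot);
}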
diff --git a/darwin-x86/clang-headers/clflushoptintrin.h b/darwin-x86/clang-headers/clflushoptintrin.h
index 60e0ead..79bb458 100644
--- a/darwin-x86/clang-headers/clflushoptintrin.h
+++ b/darwin-x86/clang-headers/clflushoptintrin.h
@@ -1,4 +1,4 @@
-/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
+/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -32,7 +32,7 @@
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clflushopt")))
 
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clflushopt(char * __m) {
+_mm_clflushopt(void const * __m) {
   __builtin_ia32_clflushopt(__m);
 }
 
diff --git a/darwin-x86/clang-headers/clwbintrin.h b/darwin-x86/clang-headers/clwbintrin.h
new file mode 100644
index 0000000..c09286b
--- /dev/null
+++ b/darwin-x86/clang-headers/clwbintrin.h
@@ -0,0 +1,52 @@
+/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLWBINTRIN_H
+#define __CLWBINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clwb")))
+
+/// Writes back to memory the cache line (if modified) that contains the
+/// linear address specified in \a __p from any level of the cache hierarchy in
+/// the cache coherence domain.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> CLWB </c> instruction.
+///
+/// \param __p
+///    A pointer to the memory location used to identify the cache line to be
+///    written back.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clwb(void const *__p) {
+  __builtin_ia32_clwb(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
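A minimal sketch built on _mm_clwb (assumes -mclwb; pairing with SFENCE is a common persistent-memory convention, not something this header mandates):

#include <immintrin.h>

void writeback_line(const void *p) {
  _mm_clwb(p);   /* write the (possibly modified) line back to memory */
  _mm_sfence();  /* order the write-back before later stores */
}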
diff --git a/darwin-x86/clang-headers/clzerointrin.h b/darwin-x86/clang-headers/clzerointrin.h
new file mode 100644
index 0000000..f4e9208
--- /dev/null
+++ b/darwin-x86/clang-headers/clzerointrin.h
@@ -0,0 +1,50 @@
+/*===----------------------- clzerointrin.h - CLZERO ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __CLZEROINTRIN_H
+#define __CLZEROINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("clzero")))
+
+/// Loads the cache line address and zeros out the cache line.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CLZERO </c> instruction.
+///
+/// \param __line
+///    A pointer to a cache line that needs to be zeroed out.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clzero (void * __line)
+{
+  __builtin_ia32_clzero ((void *)__line);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __CLZEROINTRIN_H */
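A hedged sketch for _mm_clzero (an AMD extension; assumes -mclzero):

#include <x86intrin.h>
#include <string.h>

int main(void) {
  _Alignas(64) char line[64];   /* one full cache line */
  memset(line, 0xff, sizeof line);
  _mm_clzero(line);             /* zeroes the line containing line[0] */
  return line[0];               /* 0 after CLZERO */
}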
diff --git a/darwin-x86/clang-headers/cpuid.h b/darwin-x86/clang-headers/cpuid.h
index 5da02e0..fce6af5 100644
--- a/darwin-x86/clang-headers/cpuid.h
+++ b/darwin-x86/clang-headers/cpuid.h
@@ -79,9 +79,10 @@
 #define signature_VORTEX_edx 0x36387865
 #define signature_VORTEX_ecx 0x436f5320
 
-/* Features in %ecx for level 1 */
+/* Features in %ecx for leaf 1 */
 #define bit_SSE3        0x00000001
 #define bit_PCLMULQDQ   0x00000002
+#define bit_PCLMUL      bit_PCLMULQDQ   /* for gcc compat */
 #define bit_DTES64      0x00000004
 #define bit_MONITOR     0x00000008
 #define bit_DSCPL       0x00000010
@@ -98,18 +99,22 @@
 #define bit_PCID        0x00020000
 #define bit_DCA         0x00040000
 #define bit_SSE41       0x00080000
+#define bit_SSE4_1      bit_SSE41       /* for gcc compat */
 #define bit_SSE42       0x00100000
+#define bit_SSE4_2      bit_SSE42       /* for gcc compat */
 #define bit_x2APIC      0x00200000
 #define bit_MOVBE       0x00400000
 #define bit_POPCNT      0x00800000
 #define bit_TSCDeadline 0x01000000
 #define bit_AESNI       0x02000000
+#define bit_AES         bit_AESNI       /* for gcc compat */
 #define bit_XSAVE       0x04000000
 #define bit_OSXSAVE     0x08000000
 #define bit_AVX         0x10000000
+#define bit_F16C        0x20000000
 #define bit_RDRND       0x40000000
 
-/* Features in %edx for level 1 */
+/* Features in %edx for leaf 1 */
 #define bit_FPU         0x00000001
 #define bit_VME         0x00000002
 #define bit_DE          0x00000004
@@ -119,6 +124,7 @@
 #define bit_PAE         0x00000040
 #define bit_MCE         0x00000080
 #define bit_CX8         0x00000100
+#define bit_CMPXCHG8B   bit_CX8         /* for gcc compat */
 #define bit_APIC        0x00000200
 #define bit_SEP         0x00000800
 #define bit_MTRR        0x00001000
@@ -133,7 +139,7 @@
 #define bit_ACPI        0x00400000
 #define bit_MMX         0x00800000
 #define bit_FXSR        0x01000000
-#define bit_FXSAVE      bit_FXSR    /* for gcc compat */
+#define bit_FXSAVE      bit_FXSR        /* for gcc compat */
 #define bit_SSE         0x02000000
 #define bit_SSE2        0x04000000
 #define bit_SS          0x08000000
@@ -141,44 +147,114 @@
 #define bit_TM          0x20000000
 #define bit_PBE         0x80000000
 
-/* Features in %ebx for level 7 sub-leaf 0 */
+/* Features in %ebx for leaf 7 sub-leaf 0 */
 #define bit_FSGSBASE    0x00000001
+#define bit_SGX         0x00000004
+#define bit_BMI         0x00000008
+#define bit_HLE         0x00000010
+#define bit_AVX2        0x00000020
 #define bit_SMEP        0x00000080
+#define bit_BMI2        0x00000100
 #define bit_ENH_MOVSB   0x00000200
+#define bit_INVPCID     0x00000400
+#define bit_RTM         0x00000800
+#define bit_MPX         0x00004000
+#define bit_AVX512F     0x00010000
+#define bit_AVX512DQ    0x00020000
+#define bit_RDSEED      0x00040000
+#define bit_ADX         0x00080000
+#define bit_AVX512IFMA  0x00200000
+#define bit_CLFLUSHOPT  0x00800000
+#define bit_CLWB        0x01000000
+#define bit_AVX512PF    0x04000000
+#define bit_AVX512ER    0x08000000
+#define bit_AVX512CD    0x10000000
+#define bit_SHA         0x20000000
+#define bit_AVX512BW    0x40000000
+#define bit_AVX512VL    0x80000000
+
+/* Features in %ecx for leaf 7 sub-leaf 0 */
+#define bit_PREFTCHWT1       0x00000001
+#define bit_AVX512VBMI       0x00000002
+#define bit_PKU              0x00000004
+#define bit_OSPKE            0x00000010
+#define bit_WAITPKG          0x00000020
+#define bit_AVX512VBMI2      0x00000040
+#define bit_SHSTK            0x00000080
+#define bit_GFNI             0x00000100
+#define bit_VAES             0x00000200
+#define bit_VPCLMULQDQ       0x00000400
+#define bit_AVX512VNNI       0x00000800
+#define bit_AVX512BITALG     0x00001000
+#define bit_AVX512VPOPCNTDQ  0x00004000
+#define bit_RDPID            0x00400000
+#define bit_CLDEMOTE         0x02000000
+#define bit_MOVDIRI          0x08000000
+#define bit_MOVDIR64B        0x10000000
+
+/* Features in %edx for leaf 7 sub-leaf 0 */
+#define bit_AVX5124VNNIW  0x00000004
+#define bit_AVX5124FMAPS  0x00000008
+#define bit_PCONFIG       0x00040000
+#define bit_IBT           0x00100000
+
+/* Features in %eax for leaf 13 sub-leaf 1 */
+#define bit_XSAVEOPT    0x00000001
+#define bit_XSAVEC      0x00000002
+#define bit_XSAVES      0x00000008
+
+/* Features in %eax for leaf 0x14 sub-leaf 0 */
+#define bit_PTWRITE     0x00000010
+
+/* Features in %ecx for leaf 0x80000001 */
+#define bit_LAHF_LM     0x00000001
+#define bit_ABM         0x00000020
+#define bit_LZCNT       bit_ABM        /* for gcc compat */
+#define bit_SSE4a       0x00000040
+#define bit_PRFCHW      0x00000100
+#define bit_XOP         0x00000800
+#define bit_LWP         0x00008000
+#define bit_FMA4        0x00010000
+#define bit_TBM         0x00200000
+#define bit_MWAITX      0x20000000
+
+/* Features in %edx for leaf 0x80000001 */
+#define bit_MMXEXT      0x00400000
+#define bit_LM          0x20000000
+#define bit_3DNOWP      0x40000000
+#define bit_3DNOW       0x80000000
+
+/* Features in %ebx for leaf 0x80000008 */
+#define bit_CLZERO      0x00000001
+#define bit_WBNOINVD    0x00000200
+
 
 #if __i386__
-#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
     __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
-                  : "0"(__level))
+                  : "0"(__leaf))
 
-#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
     __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
-                  : "0"(__level), "2"(__count))
+                  : "0"(__leaf), "2"(__count))
 #else
 /* x86-64 uses %rbx as the base register, so preserve it. */
-#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
     __asm("  xchgq  %%rbx,%q1\n" \
           "  cpuid\n" \
           "  xchgq  %%rbx,%q1" \
         : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
-        : "0"(__level))
+        : "0"(__leaf))
 
-#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
     __asm("  xchgq  %%rbx,%q1\n" \
           "  cpuid\n" \
           "  xchgq  %%rbx,%q1" \
         : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
-        : "0"(__level), "2"(__count))
+        : "0"(__leaf), "2"(__count))
 #endif
 
-static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
-                                 unsigned int *__ebx, unsigned int *__ecx,
-                                 unsigned int *__edx) {
-    __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
-    return 1;
-}
-
-static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
+static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
 {
     unsigned int __eax, __ebx, __ecx, __edx;
 #if __i386__
@@ -202,8 +278,35 @@
         return 0;
 #endif
 
-    __cpuid(__level, __eax, __ebx, __ecx, __edx);
+    __cpuid(__leaf, __eax, __ebx, __ecx, __edx);
     if (__sig)
         *__sig = __ebx;
     return __eax;
 }
+
+static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx)
+{
+    unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
+
+    if (__max_leaf == 0 || __max_leaf < __leaf)
+        return 0;
+
+    __cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_count (unsigned int __leaf,
+                                       unsigned int __subleaf,
+                                       unsigned int *__eax, unsigned int *__ebx,
+                                       unsigned int *__ecx, unsigned int *__edx)
+{
+    unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
+
+    if (__max_leaf == 0 || __max_leaf < __leaf)
+        return 0;
+
+    __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
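A short sketch of the new __get_cpuid_count() entry point, probing one of the leaf-7 feature bits defined above:

#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    printf("AVX2: %s\n", (ebx & bit_AVX2) ? "yes" : "no");
  else
    printf("CPUID leaf 7 is not supported\n");
  return 0;
}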
diff --git a/darwin-x86/clang-headers/cuda_wrappers/algorithm b/darwin-x86/clang-headers/cuda_wrappers/algorithm
new file mode 100644
index 0000000..01af183
--- /dev/null
+++ b/darwin-x86/clang-headers/cuda_wrappers/algorithm
@@ -0,0 +1,116 @@
+/*===---- algorithm - CUDA wrapper for <algorithm> --------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
+#define __CLANG_CUDA_WRAPPERS_ALGORITHM
+
+// This header defines __device__ overloads of std::min/max.
+//
+// Ideally we'd declare these functions only if we're <= C++11.  In C++14,
+// these functions are constexpr, and so are implicitly __host__ __device__.
+//
+// However, the compiler being in C++14 mode does not imply that the standard
+// library supports C++14.  There is no macro we can test to check that the
+// stdlib has constexpr std::min/max.  Thus we have to unconditionally define
+// our device overloads.
+//
+// A host+device function cannot be overloaded, and a constexpr function
+// implicitly becomes host+device if there's no explicit host or device
+// overload preceding it.  So the simple thing to do would be to declare our
+// device min/max overloads, and then #include_next <algorithm>.  This way our
+// device overloads would come first, and so if we have a C++14 stdlib, its
+// min/max won't become host+device and conflict with our device overloads.
+//
+// But that also doesn't work.  libstdc++ is evil and declares std::min/max in
+// an internal header that is included *before* <algorithm>.  Thus by the time
+// we're inside of this file, std::min/max may already have been declared, and
+// thus we can't prevent them from becoming host+device if they're constexpr.
+//
+// Therefore we perpetrate the following hack: We mark our __device__ overloads
+// with __attribute__((enable_if(true, ""))).  This causes the signature of the
+// function to change without changing anything else about it.  (Except that
+// overload resolution will prefer it over the __host__ __device__ version
+// rather than considering them equally good).
+
+#include_next <algorithm>
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+#pragma push_macro("_CPP14_CONSTEXPR")
+#if __cplusplus >= 201402L
+#define _CPP14_CONSTEXPR constexpr
+#else
+#define _CPP14_CONSTEXPR
+#endif
+
+template <class __T, class __Cmp>
+__attribute__((enable_if(true, "")))
+inline _CPP14_CONSTEXPR __host__ __device__ const __T &
+max(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__a, __b) ? __b : __a;
+}
+
+template <class __T>
+__attribute__((enable_if(true, "")))
+inline _CPP14_CONSTEXPR __host__ __device__ const __T &
+max(const __T &__a, const __T &__b) {
+  return __a < __b ? __b : __a;
+}
+
+template <class __T, class __Cmp>
+__attribute__((enable_if(true, "")))
+inline _CPP14_CONSTEXPR __host__ __device__ const __T &
+min(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__b, __a) ? __b : __a;
+}
+
+template <class __T>
+__attribute__((enable_if(true, "")))
+inline _CPP14_CONSTEXPR __host__ __device__ const __T &
+min(const __T &__a, const __T &__b) {
+  return __a < __b ? __a : __b;
+}
+
+#pragma pop_macro("_CPP14_CONSTEXPR")
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
diff --git a/darwin-x86/clang-headers/cuda_wrappers/complex b/darwin-x86/clang-headers/cuda_wrappers/complex
new file mode 100644
index 0000000..11d40a8
--- /dev/null
+++ b/darwin-x86/clang-headers/cuda_wrappers/complex
@@ -0,0 +1,82 @@
+/*===---- complex - CUDA wrapper for <complex> ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
+#define __CLANG_CUDA_WRAPPERS_COMPLEX
+
+// Wrapper around <complex> that forces its functions to be __host__
+// __device__.
+
+// First, include host-only headers we think are likely to be included by
+// <complex>, so that the pragma below only applies to <complex> itself.
+#if __cplusplus >= 201103L
+#include <type_traits>
+#endif
+#include <stdexcept>
+#include <cmath>
+#include <sstream>
+
+// Next, include our <algorithm> wrapper, to ensure that device overloads of
+// std::min/max are available.
+#include <algorithm>
+
+#pragma clang force_cuda_host_device begin
+
+// When compiling for device, ask libstdc++ to use its own implementations of
+// complex functions, rather than calling builtins (which resolve to library
+// functions that don't exist when compiling CUDA device code).
+//
+// This is a little dicey, because it causes libstdc++ to define a different
+// set of overloads on host and device.
+//
+//   // Present only when compiling for host.
+//   __host__ __device__ complex<float> sin(const complex<float>& x) {
+//     return __builtin_csinf(x);
+//   }
+//
+//   // Present when compiling for host and for device.
+//   template <typename T>
+//   void __host__ __device__ complex<T> sin(const complex<T>& x) {
+//     return complex<T>(sin(x.real()) * cosh(x.imag()),
+//                       cos(x.real()) * sinh(x.imag()));
+//   }
+//
+// This is safe because when compiling for device, all function calls in
+// __host__ code to sin() will still resolve to *something*, even if they don't
+// resolve to the same function as they resolve to when compiling for host.  We
+// don't care that they don't resolve to the right function because we won't
+// codegen this host code when compiling for device.
+
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#define _GLIBCXX_USE_C99_COMPLEX 0
+#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
+
+#include_next <complex>
+
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
+
+#pragma clang force_cuda_host_device end
+
+#endif // include guard
diff --git a/darwin-x86/clang-headers/cuda_wrappers/new b/darwin-x86/clang-headers/cuda_wrappers/new
new file mode 100644
index 0000000..71b7a52
--- /dev/null
+++ b/darwin-x86/clang-headers/cuda_wrappers/new
@@ -0,0 +1,96 @@
+/*===---- new - CUDA wrapper for <new> ----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_NEW
+#define __CLANG_CUDA_WRAPPERS_NEW
+
+#include_next <new>
+
+#pragma push_macro("CUDA_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define CUDA_NOEXCEPT noexcept
+#else
+#define CUDA_NOEXCEPT
+#endif
+
+// Device overrides for non-placement new and delete.
+__device__ inline void *operator new(__SIZE_TYPE__ size) {
+  if (size == 0) {
+    size = 1;
+  }
+  return ::malloc(size);
+}
+__device__ inline void *operator new(__SIZE_TYPE__ size,
+                                     const std::nothrow_t &) CUDA_NOEXCEPT {
+  return ::operator new(size);
+}
+
+__device__ inline void *operator new[](__SIZE_TYPE__ size) {
+  return ::operator new(size);
+}
+__device__ inline void *operator new[](__SIZE_TYPE__ size,
+                                       const std::nothrow_t &) {
+  return ::operator new(size);
+}
+
+__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT {
+  if (ptr) {
+    ::free(ptr);
+  }
+}
+__device__ inline void operator delete(void *ptr,
+                                       const std::nothrow_t &) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+
+__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+__device__ inline void operator delete[](void *ptr,
+                                         const std::nothrow_t &) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+
+// Sized delete, C++14 only.
+#if __cplusplus >= 201402L
+__device__ void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+__device__ void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+#endif
+
+// Device overrides for placement new and delete.
+__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
+__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
+
+#pragma pop_macro("CUDA_NOEXCEPT")
+
+#endif // include guard
diff --git a/darwin-x86/clang-headers/emmintrin.h b/darwin-x86/clang-headers/emmintrin.h
index c78d059..f0ea7cd 100644
--- a/darwin-x86/clang-headers/emmintrin.h
+++ b/darwin-x86/clang-headers/emmintrin.h
@@ -44,11 +44,25 @@
  * appear in the interface though. */
 typedef signed char __v16qs __attribute__((__vector_size__(16)));
 
-#include <f16cintrin.h>
-
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))
 
+/// Adds lower double-precision values in both operands and returns the
+///    sum in the lower 64 bits of the result. The upper 64 bits of the result
+///    are copied from the upper double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
+///    from the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_add_sd(__m128d __a, __m128d __b)
 {
@@ -56,12 +70,41 @@
   return __a;
 }
 
+/// Adds two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the sums of both
+///    operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_add_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v2df)__a + (__v2df)__b);
 }
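A small sketch of the scalar-versus-packed distinction these new comments describe (assumes SSE2, the x86-64 baseline):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d a = _mm_set_pd(10.0, 1.0);  /* lanes: [1.0, 10.0] */
  __m128d b = _mm_set_pd(20.0, 2.0);  /* lanes: [2.0, 20.0] */
  double s[2], p[2];
  _mm_storeu_pd(s, _mm_add_sd(a, b)); /* [3.0, 10.0]: upper lane from a */
  _mm_storeu_pd(p, _mm_add_pd(a, b)); /* [3.0, 30.0]: both lanes added  */
  printf("sd: %g %g  pd: %g %g\n", s[0], s[1], p[0], p[1]);
  return 0;
}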
 
+/// Subtracts the lower double-precision value of the second operand
+///    from the lower double-precision value of the first operand and returns
+///    the difference in the lower 64 bits of the result. The upper 64 bits of
+///    the result are copied from the upper double-precision value of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    difference of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sub_sd(__m128d __a, __m128d __b)
 {
@@ -69,12 +112,40 @@
   return __a;
 }
 
+/// Subtracts two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the differences between
+///    both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sub_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v2df)__a - (__v2df)__b);
 }
 
+/// Multiplies lower double-precision values in both operands and returns
+///    the product in the lower 64 bits of the result. The upper 64 bits of the
+///    result are copied from the upper double-precision value of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    product of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mul_sd(__m128d __a, __m128d __b)
 {
@@ -82,12 +153,41 @@
   return __a;
 }
 
+/// Multiplies two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the products of both
+///    operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mul_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v2df)__a * (__v2df)__b);
 }
 
+/// Divides the lower double-precision value of the first operand by the
+///    lower double-precision value of the second operand and returns the
+///    quotient in the lower 64 bits of the result. The upper 64 bits of the
+///    result are copied from the upper double-precision value of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    quotient of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_div_sd(__m128d __a, __m128d __b)
 {
@@ -95,299 +195,1133 @@
   return __a;
 }
 
+/// Performs an element-by-element division of two 128-bit vectors of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] containing the quotients of both
+///    operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_div_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v2df)__a / (__v2df)__b);
 }
 
+/// Calculates the square root of the lower double-precision value of
+///    the second operand and returns it in the lower 64 bits of the result.
+///    The upper 64 bits of the result are copied from the upper
+///    double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    upper 64 bits of this operand are copied to the upper 64 bits of the
+///    result.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    square root is calculated using the lower 64 bits of this operand.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    square root of the lower 64 bits of operand \a __b, and whose upper 64
+///    bits are copied from the upper 64 bits of operand \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sqrt_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
-  return (__m128d) { __c[0], __a[1] };
+  return __extension__ (__m128d) { __c[0], __a[1] };
 }
 
+/// Calculates the square root of each of the two values stored in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [2 x double] containing the square roots of the
+///    values in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sqrt_pd(__m128d __a)
 {
   return __builtin_ia32_sqrtpd((__v2df)__a);
 }
 
+/// Compares lower 64-bit double-precision values of both operands, and
+///    returns the lesser of the pair of values in the lower 64-bits of the
+///    result. The upper 64 bits of the result are copied from the upper
+///    double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    minimum value between both operands. The upper 64 bits are copied from
+///    the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_min_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
 }
 
+/// Performs element-by-element comparison of the two 128-bit vectors of
+///    [2 x double] and returns the vector containing the lesser of each pair of
+///    values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the minimum values
+///    between both operands.
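+///
+/// For example (a minimal sketch; variable names are illustrative):
+/// \code
+///   __m128d a = _mm_set_pd(7.0, 1.0);   // a = {1.0, 7.0}
+///   __m128d b = _mm_set_pd(5.0, 3.0);   // b = {3.0, 5.0}
+///   __m128d m = _mm_min_pd(a, b);       // m = {1.0, 5.0}
+/// \endcode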
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_min_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower 64-bit double-precision values of both operands and
+///    returns the greater of the pair of values in the lower 64 bits of the
+///    result. The upper 64 bits of the result are copied from the upper
+///    double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    maximum value between both operands. The upper 64 bits are copied from
+///    the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_max_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
 }
 
+/// Performs element-by-element comparison of the two 128-bit vectors of
+///    [2 x double] and returns the vector containing the greater of each pair
+///    of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the maximum values
+///    between both operands.
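+///
+/// Combined with _mm_min_pd, this supports a branchless clamp (a sketch;
+/// variable names are illustrative):
+/// \code
+///   __m128d x  = _mm_set_pd(9.0, -1.0);              // x = {-1.0, 9.0}
+///   __m128d lo = _mm_set1_pd(0.0);
+///   __m128d hi = _mm_set1_pd(5.0);
+///   __m128d c  = _mm_min_pd(_mm_max_pd(x, lo), hi);  // c = {0.0, 5.0}
+/// \endcode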
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_max_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
 }
 
+/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+///    values between both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_and_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)((__v4su)__a & (__v4su)__b);
+  return (__m128d)((__v2du)__a & (__v2du)__b);
 }
 
+/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
+///    the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+///    values in the second operand and the one's complement of the first
+///    operand.
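+///
+/// A common use is clearing the sign bits to take an absolute value (a
+/// sketch; variable names are illustrative):
+/// \code
+///   __m128d x    = _mm_set_pd(-2.5, 3.5);    // x = {3.5, -2.5}
+///   __m128d sign = _mm_set1_pd(-0.0);        // only the sign bits set
+///   __m128d absx = _mm_andnot_pd(sign, x);   // absx = {3.5, 2.5}
+/// \endcode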
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_andnot_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)(~(__v4su)__a & (__v4su)__b);
+  return (__m128d)(~(__v2du)__a & (__v2du)__b);
 }
 
+/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
+///    values between both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_or_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)((__v4su)__a | (__v4su)__b);
+  return (__m128d)((__v2du)__a | (__v2du)__b);
 }
 
+/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
+///    values between both operands.
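+///
+/// A common use is flipping the sign bits to negate both elements (a sketch;
+/// variable names are illustrative):
+/// \code
+///   __m128d x   = _mm_set_pd(-2.0, 1.0);             // x = {1.0, -2.0}
+///   __m128d neg = _mm_xor_pd(x, _mm_set1_pd(-0.0));  // neg = {-1.0, 2.0}
+/// \endcode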
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_xor_pd(__m128d __a, __m128d __b)
 {
-  return (__m128d)((__v4su)__a ^ (__v4su)__b);
+  return (__m128d)((__v2du)__a ^ (__v2du)__b);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
+///    for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
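+///
+/// The resulting mask can be inspected with _mm_movemask_pd (a sketch;
+/// variable names are illustrative):
+/// \code
+///   __m128d a = _mm_set_pd(1.0, 2.0);   // a = {2.0, 1.0}
+///   __m128d b = _mm_set_pd(1.0, 3.0);   // b = {3.0, 1.0}
+///   __m128d m = _mm_cmpeq_pd(a, b);     // m = {0x0, 0xFFFFFFFFFFFFFFFF}
+///   int bits  = _mm_movemask_pd(m);     // bits == 0x2
+/// \endcode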
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpeq_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are less than those in the second operand. Each comparison
+///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmplt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are less than or equal to those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmple_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are greater than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpgt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are greater than or equal to those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpge_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are ordered with respect to those in the second operand.
+///
+///    A pair of double-precision values are "ordered" with respect to each
+///    other if neither value is a NaN. Each comparison yields 0x0 for false,
+///    0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpord_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are unordered with respect to those in the second operand.
+///
+///    A pair of double-precision values are "unordered" with respect to each
+///    other if one or both values are NaN. Each comparison yields 0x0 for
+///    false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpunord_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are unequal to those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpneq_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not less than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not less than or equal to those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnle_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not greater than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpngt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
 }
 
+/// Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not greater than or equal to those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnge_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] for equality.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpeq_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than the corresponding value in
+///    the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmplt_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than or equal to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmple_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than the corresponding value
+///    in the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpgt_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
-  return (__m128d) { __c[0], __a[1] };
+  return __extension__ (__m128d) { __c[0], __a[1] };
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than or equal to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpge_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
-  return (__m128d) { __c[0], __a[1] };
+  return __extension__ (__m128d) { __c[0], __a[1] };
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is "ordered" with respect to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
+///    of double-precision values are "ordered" with respect to each other if
+///    neither value is a NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpord_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is "unordered" with respect to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
+///    of double-precision values are "unordered" with respect to each other if
+///    one or both values are NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpunord_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is unequal to the corresponding value in
+///    the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpneq_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not less than the corresponding
+///    value in the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not less than or equal to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnle_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not greater than the corresponding
+///    value in the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpngt_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
-  return (__m128d) { __c[0], __a[1] };
+  return __extension__ (__m128d) { __c[0], __a[1] };
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not greater than or equal to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnge_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
-  return (__m128d) { __c[0], __a[1] };
+  return __extension__ (__m128d) { __c[0], __a[1] };
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] for equality.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
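+///
+/// For example (a sketch; variable names are illustrative):
+/// \code
+///   __m128d a  = _mm_set_sd(1.0);       // a = {1.0, 0.0}
+///   __m128d b  = _mm_set_sd(1.0);       // b = {1.0, 0.0}
+///   int     eq = _mm_comieq_sd(a, b);   // eq == 1
+/// \endcode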
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than the corresponding value in
+///    the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than or equal to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than the corresponding value
+///    in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than or equal to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is unequal to the corresponding value in
+///    the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] for equality. The
+///    comparison yields 0 for false, 1 for true.
+///
+///    If either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than the corresponding value in
+///    the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than or equal to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than the corresponding value
+///    in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than or equal to the
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true.  If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
 }
 
+/// Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is unequal to the corresponding value in
+///    the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison result. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
 }
 
+/// Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two single-precision floating-point
+///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
+///    The upper 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted values. The upper 64 bits are set to zero.
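+///
+/// For example (a sketch; variable names are illustrative):
+/// \code
+///   __m128d d = _mm_set_pd(2.0, 1.0);   // d = {1.0, 2.0}
+///   __m128  f = _mm_cvtpd_ps(d);        // f = {1.0f, 2.0f, 0.0f, 0.0f}
+/// \endcode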
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpd_ps(__m128d __a)
 {
   return __builtin_ia32_cvtpd2ps((__v2df)__a);
 }
 
+/// Converts the lower two single-precision floating-point elements of a
+///    128-bit vector of [4 x float] into two double-precision floating-point
+///    values, returned in a 128-bit vector of [2 x double]. The upper two
+///    elements of the input vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower two single-precision
+///    floating-point elements are converted to double-precision values. The
+///    upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtps_pd(__m128 __a)
 {
@@ -395,6 +1329,22 @@
       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
 }
 
+/// Converts the lower two integer elements of a 128-bit vector of
+///    [4 x i32] into two double-precision floating-point values, returned in a
+///    128-bit vector of [2 x double].
+///
+///    The upper two elements of the input vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32]. The lower two integer elements
+///    are converted to double-precision values. The upper two elements are
+///    unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtepi32_pd(__m128i __a)
 {
@@ -402,25 +1352,84 @@
       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
 }
 
+/// Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
+///    64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+///    converted values. The upper 64 bits are set to zero.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtpd_epi32(__m128d __a)
 {
   return __builtin_ia32_cvtpd2dq((__v2df)__a);
 }
 
+/// Converts the low-order element of a 128-bit vector of [2 x double]
+///    into a 32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 32-bit signed integer containing the converted value.
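+///
+/// For example (a sketch; variable names are illustrative; the result assumes
+/// the default round-to-nearest MXCSR rounding mode):
+/// \code
+///   __m128d v = _mm_set_sd(2.7);     // v = {2.7, 0.0}
+///   int     i = _mm_cvtsd_si32(v);   // i == 3 under round-to-nearest
+/// \endcode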
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsd_si32(__m128d __a)
 {
   return __builtin_ia32_cvtsd2si((__v2df)__a);
 }
 
+/// Converts the lower double-precision floating-point element of a
+///    128-bit vector of [2 x double], in the second parameter, into a
+///    single-precision floating-point value, returned in the lower 32 bits of a
+///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
+///    copied from the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
+///    copied to the upper 96 bits of the result.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision
+///    floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
+///    converted value from the second parameter. The upper 96 bits are copied
+///    from the upper 96 bits of the first parameter.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsd_ss(__m128 __a, __m128d __b)
 {
-  __a[0] = __b[0];
-  return __a;
+  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
 }
 
+/// Converts a 32-bit signed integer value, in the second parameter, into
+///    a double-precision floating-point value, returned in the lower 64 bits of
+///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+///    are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+///    copied to the upper 64 bits of the result.
+/// \param __b
+///    A 32-bit signed integer containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+///    converted value from the second parameter. The upper 64 bits are copied
+///    from the upper 64 bits of the first parameter.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtsi32_sd(__m128d __a, int __b)
 {
@@ -428,6 +1437,25 @@
   return __a;
 }
 
+/// Converts the lower single-precision floating-point element of a
+///    128-bit vector of [4 x float], in the second parameter, into a
+///    double-precision floating-point value, returned in the lower 64 bits of
+///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+///    are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+///    copied to the upper 64 bits of the result.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower single-precision
+///    floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+///    converted value from the second parameter. The upper 64 bits are copied
+///    from the upper 64 bits of the first parameter.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtss_sd(__m128d __a, __m128 __b)
 {
@@ -435,48 +1463,147 @@
   return __a;
 }
 
+/// Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
+///
+///    If the result of either conversion is inexact, the result is truncated
+///    (rounded towards zero) regardless of the current MXCSR setting. The upper
+///    64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+///    converted values. The upper 64 bits are set to zero.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvttpd_epi32(__m128d __a)
 {
   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
 }
 
+/// Converts the low-order element of a [2 x double] vector into a 32-bit
+///    signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 32-bit signed integer containing the converted value.
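+///
+/// For example (a sketch; variable names are illustrative):
+/// \code
+///   __m128d v = _mm_set_sd(2.7);      // v = {2.7, 0.0}
+///   int     i = _mm_cvttsd_si32(v);   // i == 2, regardless of rounding mode
+/// \endcode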
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvttsd_si32(__m128d __a)
 {
-  return __a[0];
+  return __builtin_ia32_cvttsd2si((__v2df)__a);
 }
 
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+/// Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtpd_pi32(__m128d __a)
 {
   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
 }
 
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+/// Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in a 64-bit vector of [2 x i32].
+///
+///    If the result of either conversion is inexact, the result is truncated
+///    (rounded towards zero) regardless of the current MXCSR setting.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_cvttpd_pi32(__m128d __a)
 {
   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+/// Converts the two signed 32-bit integer elements of a 64-bit vector of
+///    [2 x i32] into two double-precision floating-point values, returned in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32].
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
 _mm_cvtpi32_pd(__m64 __a)
 {
   return __builtin_ia32_cvtpi2pd((__v2si)__a);
 }
 
+/// Returns the low-order element of a 128-bit vector of [2 x double] as
+///    a double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
+/// \returns A double-precision floating-point value copied from the lower 64
+///    bits of \a __a.
 static __inline__ double __DEFAULT_FN_ATTRS
 _mm_cvtsd_f64(__m128d __a)
 {
   return __a[0];
 }
 
+/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 16-byte aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load_pd(double const *__dp)
 {
   return *(__m128d*)__dp;
 }
 
+/// Loads a double-precision floating-point value from a specified memory
+///    location and duplicates it to both vector elements of a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
+///
+/// \param __dp
+///    A pointer to a memory location containing a double-precision value.
+/// \returns A 128-bit vector of [2 x double] containing the loaded and
+///    duplicated values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load1_pd(double const *__dp)
 {
@@ -484,11 +1611,25 @@
     double __u;
   } __attribute__((__packed__, __may_alias__));
   double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
-  return (__m128d){ __u, __u };
+  return __extension__ (__m128d){ __u, __u };
 }
 
 #define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
 
+/// Loads two double-precision values, in reverse order, from an aligned
+///    memory location into a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction
+///   plus any needed shuffling instructions. In AVX mode, the shuffling may
+///   be combined with the \c VMOVAPD, resulting in only a \c VPERMILPD
+///   instruction.
+///
+/// \param __dp
+///    A 16-byte aligned pointer to an array of double-precision values to be
+///    loaded in reverse order.
+/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
+///    values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadr_pd(double const *__dp)
 {
@@ -496,6 +1637,17 @@
   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
 }
 
+/// Loads a 128-bit floating-point vector of [2 x double] from an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
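+///
+/// For example (a sketch; the array and variable names are illustrative):
+/// \code
+///   double buf[2] = { 1.0, 2.0 };     // no 16-byte alignment required here
+///   __m128d v = _mm_loadu_pd(buf);    // v = {1.0, 2.0}
+///   // _mm_load_pd(buf) would require buf to be 16-byte aligned.
+/// \endcode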
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadu_pd(double const *__dp)
 {
@@ -505,6 +1657,17 @@
   return ((struct __loadu_pd*)__dp)->__v;
 }
 
+/// Loads a 64-bit integer value to the low element of a 128-bit integer
+///    vector and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+///    A pointer to a 64-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadu_si64(void const *__a)
 {
@@ -512,9 +1675,20 @@
     long long __v;
   } __attribute__((__packed__, __may_alias__));
   long long __u = ((struct __loadu_si64*)__a)->__v;
-  return (__m128i){__u, 0L};
+  return __extension__ (__m128i)(__v2di){__u, 0L};
 }
 
+/// Loads a 64-bit double-precision value to the low element of a
+///    128-bit vector of [2 x double] and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a memory location containing a double-precision value.
+///    The address of the memory location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded value.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load_sd(double const *__dp)
 {
@@ -522,9 +1696,26 @@
     double __u;
   } __attribute__((__packed__, __may_alias__));
   double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
-  return (__m128d){ __u, 0 };
+  return __extension__ (__m128d){ __u, 0 };
 }
 
+/// Loads a double-precision value into the high-order bits of a 128-bit
+///    vector of [2 x double]. The low-order bits are copied from the low-order
+///    bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [63:0] are written to bits [63:0] of the result.
+/// \param __dp
+///    A pointer to a 64-bit memory location containing a double-precision
+///    floating-point value that is loaded. The loaded value is written to bits
+///    [127:64] of the result. The address of the memory location does not have
+///    to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadh_pd(__m128d __a, double const *__dp)
 {
@@ -532,9 +1723,26 @@
     double __u;
   } __attribute__((__packed__, __may_alias__));
   double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
-  return (__m128d){ __a[0], __u };
+  return __extension__ (__m128d){ __a[0], __u };
 }
 
+/// Loads a double-precision value into the low-order bits of a 128-bit
+///    vector of [2 x double]. The high-order bits are copied from the
+///    high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [127:64] are written to bits [127:64] of the result.
+/// \param __dp
+///    A pointer to a 64-bit memory location containing a double-precision
+///    floating-point value that is loaded. The loaded value is written to bits
+///    [63:0] of the result. The address of the memory location does not have to
+///    be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadl_pd(__m128d __a, double const *__dp)
 {
@@ -542,51 +1750,171 @@
     double __u;
   } __attribute__((__packed__, __may_alias__));
   double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
-  return (__m128d){ __u, __a[1] };
+  return __extension__ (__m128d){ __u, __a[1] };
 }
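
A sketch combining _mm_loadl_pd and _mm_loadh_pd to assemble a vector
from two separate scalars; the function and parameter names are invented:

#include <emmintrin.h>

static __m128d gather_pair(double const *lo, double const *hi)
{
  __m128d v = _mm_setzero_pd();
  v = _mm_loadl_pd(v, lo); /* bits [63:0]   <- *lo */
  v = _mm_loadh_pd(v, hi); /* bits [127:64] <- *hi */
  return v;
}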
 
+/// Constructs a 128-bit floating-point vector of [2 x double] with
+///    unspecified content. This could be used as an argument to another
+///    intrinsic function where the argument is required but the value is not
+///    actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
+///    content.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_undefined_pd(void)
 {
   return (__m128d)__builtin_ia32_undef128();
 }
 
+/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
+///    64 bits of the vector are initialized with the specified double-precision
+///    floating-point value. The upper 64 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize the lower 64
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
+///    lower 64 bits contain the value of the parameter. The upper 64 bits are
+///    set to zero.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_set_sd(double __w)
 {
-  return (__m128d){ __w, 0 };
+  return __extension__ (__m128d){ __w, 0 };
 }
 
+/// Constructs a 128-bit floating-point vector of [2 x double], with each
+///    of the two double-precision floating-point vector elements set to the
+///    specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_set1_pd(double __w)
 {
-  return (__m128d){ __w, __w };
+  return __extension__ (__m128d){ __w, __w };
 }
 
+/// Constructs a 128-bit floating-point vector of [2 x double], with each
+///    of the two double-precision floating-point vector elements set to the
+///    specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_pd1(double __w)
+{
+  return _mm_set1_pd(__w);
+}
+
+/// Constructs a 128-bit floating-point vector of [2 x double]
+///    initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize the upper 64
+///    bits of the result.
+/// \param __x
+///    A double-precision floating-point value used to initialize the lower 64
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_set_pd(double __w, double __x)
 {
-  return (__m128d){ __x, __w };
+  return __extension__ (__m128d){ __x, __w };
 }
 
+/// Constructs a 128-bit floating-point vector of [2 x double],
+///    initialized in reverse order with the specified double-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize the lower 64
+///    bits of the result.
+/// \param __x
+///    A double-precision floating-point value used to initialize the upper 64
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_setr_pd(double __w, double __x)
 {
-  return (__m128d){ __w, __x };
+  return __extension__ (__m128d){ __w, __x };
 }
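
A sketch of the argument ordering: _mm_set_pd takes (high, low) while
_mm_setr_pd takes (low, high); the stores and literals are illustrative:

#include <emmintrin.h>

static void ordering_demo(double out[4])
{
  __m128d a = _mm_set_pd(9.0, 1.0);  /* high first: a = { 1.0, 9.0 } */
  __m128d b = _mm_setr_pd(1.0, 9.0); /* low first:  b = { 1.0, 9.0 } */
  _mm_storeu_pd(out, a);
  _mm_storeu_pd(out + 2, b); /* both pairs come out identical */
}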
 
+/// Constructs a 128-bit floating-point vector of [2 x double]
+///    initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [2 x double] with
+///    all elements set to zero.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_setzero_pd(void)
 {
-  return (__m128d){ 0, 0 };
+  return __extension__ (__m128d){ 0, 0 };
 }
 
+/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
+///    64 bits are set to the lower 64 bits of the second parameter. The upper
+///    64 bits are set to the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
+///    upper 64 bits of the result.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
+///    lower 64 bits of the result.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_move_sd(__m128d __a, __m128d __b)
 {
-  return (__m128d){ __b[0], __a[1] };
+  __a[0] = __b[0];
+  return __a;
 }
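
A sketch of the merge performed by _mm_move_sd; the values are arbitrary:

#include <emmintrin.h>

static __m128d merge_low(void)
{
  __m128d a = _mm_setr_pd(10.0, 20.0);
  __m128d b = _mm_setr_pd(30.0, 40.0);
  return _mm_move_sd(a, b); /* { 30.0, 20.0 }: low from b, high from a */
}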
 
+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_sd(double *__dp, __m128d __a)
 {
@@ -596,12 +1924,39 @@
   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
 }
 
+/// Moves packed double-precision values from a 128-bit vector of
+///    [2 x double] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to an aligned memory location that can store two
+///    double-precision values.
+/// \param __a
+///    A packed 128-bit vector of [2 x double] containing the values to be
+///    moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_pd(double *__dp, __m128d __a)
 {
   *(__m128d*)__dp = __a;
 }
 
+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to both
+///    the upper and lower 64 bits of a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the
+///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
+///
+/// \param __dp
+///    A pointer to a memory location that can store two double-precision
+///    values.
+/// \param __a
+///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
+///    of the values in \a __dp.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store1_pd(double *__dp, __m128d __a)
 {
@@ -609,12 +1964,38 @@
   _mm_store_pd(__dp, __a);
 }
 
+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to both
+///    the upper and lower 64 bits of a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the
+///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
+///
+/// \param __dp
+///    A pointer to a memory location that can store two double-precision
+///    values.
+/// \param __a
+///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
+///    of the values in \a __dp.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_pd1(double *__dp, __m128d __a)
 {
-  return _mm_store1_pd(__dp, __a);
+  _mm_store1_pd(__dp, __a);
 }
 
+/// Stores a 128-bit vector of [2 x double] into an unaligned memory
+///    location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_pd(double *__dp, __m128d __a)
 {
@@ -624,6 +2005,20 @@
   ((struct __storeu_pd*)__dp)->__v = __a;
 }
 
+/// Stores two double-precision values, in reverse order, from a 128-bit
+///    vector of [2 x double] to a 16-byte aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to a shuffling instruction followed by a
+/// <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 16-byte aligned memory location that can store two
+///    double-precision values.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be reversed and
+///    stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storer_pd(double *__dp, __m128d __a)
 {
@@ -631,6 +2026,17 @@
   *(__m128d *)__dp = __a;
 }
 
+/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeh_pd(double *__dp, __m128d __a)
 {
@@ -640,6 +2046,17 @@
   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
 }
 
+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_pd(double *__dp, __m128d __a)
 {
@@ -649,159 +2066,437 @@
   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
 }
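
A sketch scattering the two lanes of a vector with these stores; the
names are invented for illustration:

#include <emmintrin.h>

static void split_lanes(double *lo, double *hi, __m128d v)
{
  _mm_storel_pd(lo, v); /* *lo = v[0], the low lane */
  _mm_storeh_pd(hi, v); /* *hi = v[1], the high lane */
}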
 
+/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
+///    saving the lower 8 bits of each sum in the corresponding element of a
+///    128-bit result vector of [16 x i8].
+///
+///    The integer elements of both parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the sums of both
+///    parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qu)__a + (__v16qu)__b);
 }
 
+/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
+///    saving the lower 16 bits of each sum in the corresponding element of a
+///    128-bit result vector of [8 x i16].
+///
+///    The integer elements of both parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+/// \returns A 128-bit vector of [8 x i16] containing the sums of both
+///    parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hu)__a + (__v8hu)__b);
 }
 
+/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
+///    saving the lower 32 bits of each sum in the corresponding element of a
+///    128-bit result vector of [4 x i32].
+///
+///    The integer elements of both parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+/// \param __b
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the sums of both
+///    parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4su)__a + (__v4su)__b);
 }
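
A sketch showing that the non-saturating adds wrap modulo the element
width; the constants are chosen only to make the wrap visible:

#include <emmintrin.h>

static __m128i wrapping_add(void)
{
  __m128i a = _mm_set1_epi32(0x7FFFFFFF);
  __m128i b = _mm_set1_epi32(1);
  return _mm_add_epi32(a, b); /* every lane wraps to 0x80000000 */
}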
 
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+/// Adds two signed or unsigned 64-bit integer values, returning the
+///    lower 64 bits of the sum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer.
+/// \param __b
+///    A 64-bit integer.
+/// \returns A 64-bit integer containing the sum of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_add_si64(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
 }
 
+/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
+///    saving the lower 64 bits of each sum in the corresponding element of a
+///    128-bit result vector of [2 x i64].
+///
+///    The integer elements of both parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64].
+/// \param __b
+///    A 128-bit vector of [2 x i64].
+/// \returns A 128-bit vector of [2 x i64] containing the sums of both
+///    parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi64(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v2du)__a + (__v2du)__b);
 }
 
+/// Adds, with saturation, the corresponding elements of two 128-bit
+///    signed [16 x i8] vectors, saving each sum in the corresponding element of
+///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
+///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [16 x i8] vector.
+/// \param __b
+///    A 128-bit signed [16 x i8] vector.
+/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
+///    both parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
 }
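
A sketch contrasting saturation with the wrapping adds above; the
literals are illustrative:

#include <emmintrin.h>

static __m128i saturating_add(void)
{
  __m128i a = _mm_set1_epi8(112);
  __m128i b = _mm_set1_epi8(48);
  return _mm_adds_epi8(a, b); /* 112 + 48 = 160 clamps to 0x7F (127) */
}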
 
+/// Adds, with saturation, the corresponding elements of two 128-bit
+///    signed [8 x i16] vectors, saving each sum in the corresponding element of
+///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
+///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
+///    0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
+///    both parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// Adds, with saturation, the corresponding elements of two 128-bit
+///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
+///    of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
+///    are saturated to 0xFF. Negative sums are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
+///    of both parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// Adds, with saturation, the corresponding elements of two 128-bit
+///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
+///    of a 128-bit result vector of [8 x i16]. Positive sums greater than
+///    0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+///    A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
+///    of both parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// Computes the rounded averages of corresponding elements of two
+///    128-bit unsigned [16 x i8] vectors, saving each result in the
+///    corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
+///    averages of both parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_avg_epu8(__m128i __a, __m128i __b)
 {
-  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+  typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
+  return (__m128i)__builtin_convertvector(
+               ((__builtin_convertvector((__v16qu)__a, __v16hu) +
+                 __builtin_convertvector((__v16qu)__b, __v16hu)) + 1)
+                 >> 1, __v16qu);
 }
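
A sketch of the rounding rule, (a + b + 1) >> 1 per lane, matching the
widened arithmetic in the body above:

#include <emmintrin.h>

static __m128i avg_demo(void)
{
  __m128i a = _mm_set1_epi8(10);
  __m128i b = _mm_set1_epi8(15);
  return _mm_avg_epu8(a, b); /* (10 + 15 + 1) >> 1 = 13 in every lane */
}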
 
+/// Computes the rounded averages of corresponding elements of two
+///    128-bit unsigned [8 x i16] vectors, saving each result in the
+///    corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+///    A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
+///    averages of both parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_avg_epu16(__m128i __a, __m128i __b)
 {
-  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+  typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
+  return (__m128i)__builtin_convertvector(
+               ((__builtin_convertvector((__v8hu)__a, __v8su) +
+                 __builtin_convertvector((__v8hu)__b, __v8su)) + 1)
+                 >> 1, __v8hu);
 }
 
+/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
+///    vectors, producing eight intermediate 32-bit signed integer products, and
+///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
+///    [4 x i32] vector.
+///
+///    For example, bits [15:0] of both parameters are multiplied producing a
+///    32-bit product, bits [31:16] of both parameters are multiplied producing
+///    a 32-bit product, and the sum of those two products becomes bits [31:0]
+///    of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
+///    of both parameters.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_madd_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
 }
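
A sketch of the pairwise multiply-add: each 32-bit lane receives
a[2k]*b[2k] + a[2k+1]*b[2k+1]; the constants are illustrative:

#include <emmintrin.h>

static __m128i dot_of_pairs(void)
{
  __m128i a = _mm_set1_epi16(3);
  __m128i b = _mm_set1_epi16(4);
  return _mm_madd_epi16(a, b); /* 3*4 + 3*4 = 24 in each i32 lane */
}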
 
+/// Compares corresponding elements of two 128-bit signed [8 x i16]
+///    vectors, saving the greater value from each comparison in the
+///    corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
+///    each comparison.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
+///    vectors, saving the greater value from each comparison in the
+///    corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
+///    each comparison.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// Compares corresponding elements of two 128-bit signed [8 x i16]
+///    vectors, saving the smaller value from each comparison in the
+///    corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
+///    each comparison.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
+///    vectors, saving the smaller value from each comparison in the
+///    corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
+///    each comparison.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// Multiplies the corresponding elements of two signed [8 x i16]
+///    vectors, saving the upper 16 bits of each 32-bit product in the
+///    corresponding element of a 128-bit signed [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
+///    each of the eight 32-bit products.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mulhi_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// Multiplies the corresponding elements of two unsigned [8 x i16]
+///    vectors, saving the upper 16 bits of each 32-bit product in the
+///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+///    A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
+///    of each of the eight 32-bit products.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mulhi_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// \brief Multiplies the corresponding elements of two [8 x short] vectors and
-///    returns a vector containing the low-order 16 bits of each 32-bit product
-///    in the corresponding element.
+/// Multiplies the corresponding elements of two signed [8 x i16]
+///    vectors, saving the lower 16 bits of each 32-bit product in the
+///    corresponding element of a 128-bit signed [8 x i16] result vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
+/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
 ///
 /// \param __a
-///    A 128-bit integer vector containing one of the source operands.
+///    A 128-bit signed [8 x i16] vector.
 /// \param __b
-///    A 128-bit integer vector containing one of the source operands.
-/// \returns A 128-bit integer vector containing the products of both operands.
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
+///    each of the eight 32-bit products.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hu)__a * (__v8hu)__b);
 }
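
A sketch recovering the full 32-bit products from the two 16-bit
halves; the helper and parameter names are invented:

#include <emmintrin.h>

static void full_products(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
{
  *hi = _mm_mulhi_epi16(a, b); /* upper 16 bits of each product */
  *lo = _mm_mullo_epi16(a, b); /* lower 16 bits of each product */
}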
 
-/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
+/// Multiplies 32-bit unsigned integer values contained in the lower bits
 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
 ///    product.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMULUDQ instruction.
+/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer containing one of the source operands.
 /// \param __b
 ///    A 64-bit integer containing one of the source operands.
 /// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_mul_su32(__m64 __a, __m64 __b)
 {
   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
 }
 
-/// \brief Multiplies 32-bit unsigned integer values contained in the lower
+/// Multiplies 32-bit unsigned integer values contained in the lower
 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
+/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
 ///
 /// \param __a
 ///    A [2 x i64] vector containing one of the source operands.
@@ -814,15 +2509,15 @@
   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
 }
 
-/// \brief Computes the absolute differences of corresponding 8-bit integer
+/// Computes the absolute differences of corresponding 8-bit integer
 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
-///    separately sums the second 8 absolute differences. Packss these two
+///    separately sums the second 8 absolute differences. Packs these two
 ///    unsigned 16-bit integer sums into the upper and lower elements of a
 ///    [2 x i64] vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
+/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing one of the source operands.
@@ -836,11 +2531,11 @@
   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
 }
 
-/// \brief Subtracts the corresponding 8-bit integer values in the operands.
+/// Subtracts the corresponding 8-bit integer values in the operands.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
+/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the minuends.
@@ -854,11 +2549,11 @@
   return (__m128i)((__v16qu)__a - (__v16qu)__b);
 }
 
-/// \brief Subtracts the corresponding 16-bit integer values in the operands.
+/// Subtracts the corresponding 16-bit integer values in the operands.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
+/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the minuends.
@@ -872,11 +2567,11 @@
   return (__m128i)((__v8hu)__a - (__v8hu)__b);
 }
 
-/// \brief Subtracts the corresponding 32-bit integer values in the operands.
+/// Subtracts the corresponding 32-bit integer values in the operands.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
+/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the minuends.
@@ -890,12 +2585,12 @@
   return (__m128i)((__v4su)__a - (__v4su)__b);
 }
 
-/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
+/// Subtracts signed or unsigned 64-bit integer values and writes the
 ///    difference to the corresponding bits in the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSUBQ instruction.
+/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing the minuend.
@@ -903,17 +2598,17 @@
 ///    A 64-bit integer vector containing the subtrahend.
 /// \returns A 64-bit integer vector containing the difference of the values in
 ///    the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_sub_si64(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
 }
 
-/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
+/// Subtracts the corresponding elements of two [2 x i64] vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
+/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the minuends.
@@ -927,14 +2622,14 @@
   return (__m128i)((__v2du)__a - (__v2du)__b);
 }
 
-/// \brief Subtracts corresponding 8-bit signed integer values in the input and
+/// Subtracts corresponding 8-bit signed integer values in the input and
 ///    returns the differences in the corresponding bytes in the destination.
-///    Differences greater than 7Fh are saturated to 7Fh, and differences less
-///    than 80h are saturated to 80h.
+///    Differences greater than 0x7F are saturated to 0x7F, and differences less
+///    than 0x80 are saturated to 0x80.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
+/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the minuends.
@@ -948,14 +2643,14 @@
   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
 }
 
-/// \brief Subtracts corresponding 16-bit signed integer values in the input and
+/// Subtracts corresponding 16-bit signed integer values in the input and
 ///    returns the differences in the corresponding bytes in the destination.
-///    Differences greater than 7FFFh are saturated to 7FFFh, and values less
-///    than 8000h are saturated to 8000h.
+///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
+///    than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
+/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the minuends.
@@ -969,13 +2664,13 @@
   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
+/// Subtracts corresponding 8-bit unsigned integer values in the input
 ///    and returns the differences in the corresponding bytes in the
-///    destination. Differences less than 00h are saturated to 00h.
+///    destination. Differences less than 0x00 are saturated to 0x00.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
+/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the minuends.
@@ -989,13 +2684,13 @@
   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
 }
 
-/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
+/// Subtracts corresponding 16-bit unsigned integer values in the input
 ///    and returns the differences in the corresponding bytes in the
-///    destination. Differences less than 0000h are saturated to 0000h.
+///    destination. Differences less than 0x0000 are saturated to 0x0000.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
+/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the minuends.
@@ -1009,11 +2704,11 @@
   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// \brief Performs a bitwise AND of two 128-bit integer vectors.
+/// Performs a bitwise AND of two 128-bit integer vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPAND / PAND instruction.
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing one of the source operands.
@@ -1027,12 +2722,12 @@
   return (__m128i)((__v2du)__a & (__v2du)__b);
 }
 
-/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
+/// Performs a bitwise AND of two 128-bit integer vectors, using the
 ///    one's complement of the values contained in the first source operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector containing the left source operand. The one's complement
@@ -1046,11 +2741,11 @@
 {
   return (__m128i)(~(__v2du)__a & (__v2du)__b);
 }
-/// \brief Performs a bitwise OR of two 128-bit integer vectors.
+/// Performs a bitwise OR of two 128-bit integer vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPOR / POR instruction.
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing one of the source operands.
@@ -1064,11 +2759,11 @@
   return (__m128i)((__v2du)__a | (__v2du)__b);
 }
 
-/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
+/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing one of the source operands.
@@ -1082,7 +2777,7 @@
   return (__m128i)((__v2du)__a ^ (__v2du)__b);
 }
 
-/// \brief Left-shifts the 128-bit integer vector operand by the specified
+/// Left-shifts the 128-bit integer vector operand by the specified
 ///    number of bytes. Low-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
@@ -1091,50 +2786,32 @@
 /// __m128i _mm_slli_si128(__m128i a, const int imm);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
+/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
 ///
 /// \param a
 ///    A 128-bit integer vector containing the source operand.
 /// \param imm
-///    An immediate value specifying the number of bytes to left-shift
-///    operand a.
+///    An immediate value specifying the number of bytes to left-shift operand
+///    \a a.
 /// \returns A 128-bit integer vector containing the left-shifted value.
-#define _mm_slli_si128(a, imm) __extension__ ({                              \
-  (__m128i)__builtin_shufflevector(                                          \
-                                 (__v16qi)_mm_setzero_si128(),               \
-                                 (__v16qi)(__m128i)(a),                      \
-                                 ((char)(imm)&0xF0) ?  0 : 16 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ?  1 : 17 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ?  2 : 18 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ?  3 : 19 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ?  4 : 20 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ?  5 : 21 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ?  6 : 22 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ?  7 : 23 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ?  8 : 24 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ?  9 : 25 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
-                                 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
+#define _mm_slli_si128(a, imm) \
+  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
 
 #define _mm_bslli_si128(a, imm) \
-  _mm_slli_si128((a), (imm))
+  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
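
A sketch of the byte-shift semantics: a left shift by 8 bytes moves the
low 64-bit lane into the high lane and zero-fills the low lane:

#include <emmintrin.h>

static __m128i lane_shift_up(__m128i v)
{
  return _mm_slli_si128(v, 8); /* as [2 x i64]: { 0, v[0] } */
}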
 
-/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+/// Left-shifts each 16-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. Low-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    An integer value specifying the number of bits to left-shift each value
-///    in operand __a.
+///    in operand \a __a.
 /// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi16(__m128i __a, int __count)
@@ -1142,18 +2819,18 @@
   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
 }
 
-/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+/// Left-shifts each 16-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. Low-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to left-shift each value in operand __a.
+///    to left-shift each value in operand \a __a.
 /// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi16(__m128i __a, __m128i __count)
@@ -1161,18 +2838,18 @@
   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
 }
 
-/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+/// Left-shifts each 32-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. Low-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    An integer value specifying the number of bits to left-shift each value
-///    in operand __a.
+///    in operand \a __a.
 /// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi32(__m128i __a, int __count)
@@ -1180,18 +2857,18 @@
   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
 }
 
-/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+/// Left-shifts each 32-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. Low-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to left-shift each value in operand __a.
+///    to left-shift each value in operand \a __a.
 /// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi32(__m128i __a, __m128i __count)
@@ -1199,18 +2876,18 @@
   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
 }
 
-/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+/// Left-shifts each 64-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. Low-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    An integer value specifying the number of bits to left-shift each value
-///    in operand __a.
+///    in operand \a __a.
 /// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi64(__m128i __a, int __count)
@@ -1218,18 +2895,18 @@
   return __builtin_ia32_psllqi128((__v2di)__a, __count);
 }
 
-/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+/// Left-shifts each 64-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. Low-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to left-shift each value in operand __a.
+///    to left-shift each value in operand \a __a.
 /// \returns A 128-bit integer vector containing the left-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi64(__m128i __a, __m128i __count)
@@ -1237,19 +2914,19 @@
   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
 }
 
-/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+/// Right-shifts each 16-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. High-order bits are filled with the sign
 ///    bit of the initial value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    An integer value specifying the number of bits to right-shift each value
-///    in operand __a.
+///    in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi16(__m128i __a, int __count)
@@ -1257,19 +2934,19 @@
   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
 }
 
-/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+/// Right-shifts each 16-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. High-order bits are filled with the sign
 ///    bit of the initial value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand __a.
+///    to right-shift each value in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi16(__m128i __a, __m128i __count)
@@ -1277,19 +2954,19 @@
   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
 }
 
-/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+/// Right-shifts each 32-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. High-order bits are filled with the sign
 ///    bit of the initial value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    An integer value specifying the number of bits to right-shift each value
-///    in operand __a.
+///    in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi32(__m128i __a, int __count)
@@ -1297,19 +2974,19 @@
   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
 }
 
-/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+/// Right-shifts each 32-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. High-order bits are filled with the sign
 ///    bit of the initial value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand __a.
+///    to right-shift each value in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi32(__m128i __a, __m128i __count)
@@ -1317,7 +2994,7 @@
   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
 }
 
-/// \brief Right-shifts the 128-bit integer vector operand by the specified
+/// Right-shifts the 128-bit integer vector operand by the specified
 ///    number of bytes. High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
@@ -1326,50 +3003,32 @@
 /// __m128i _mm_srli_si128(__m128i a, const int imm);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
+/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
 ///
 /// \param a
 ///    A 128-bit integer vector containing the source operand.
 /// \param imm
 ///    An immediate value specifying the number of bytes to right-shift operand
-///    a.
+///    \a a.
 /// \returns A 128-bit integer vector containing the right-shifted value.
-#define _mm_srli_si128(a, imm) __extension__ ({                              \
-  (__m128i)__builtin_shufflevector(                                          \
-                                 (__v16qi)(__m128i)(a),                      \
-                                 (__v16qi)_mm_setzero_si128(),               \
-                                 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
-                                 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
-                                 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
-                                 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
-                                 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
-                                 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
-                                 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
-                                 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
-                                 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
-                                 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
-                                 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
-                                 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
-                                 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
-                                 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
-                                 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
-                                 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
+#define _mm_srli_si128(a, imm) \
+  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
 
 #define _mm_bsrli_si128(a, imm) \
-  _mm_srli_si128((a), (imm))
+  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
 
-/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
+/// Right-shifts each of the 16-bit values in the 128-bit integer vector
 ///    operand by the specified number of bits. High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    An integer value specifying the number of bits to right-shift each value
-///    in operand __a.
+///    in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi16(__m128i __a, int __count)
@@ -1377,18 +3036,18 @@
   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
 }
 
-/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
+/// Right-shifts each of the 16-bit values in the 128-bit integer vector
 ///    operand by the specified number of bits. High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand __a.
+///    to right-shift each value in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi16(__m128i __a, __m128i __count)
@@ -1396,18 +3055,18 @@
   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
 }
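
A short sketch contrasting the two count forms, assuming SSE2: the immediate
variant takes a literal count, while _mm_srl_epi16 reads its count from bits
[63:0] of a vector, built here with _mm_cvtsi32_si128 (defined later in this
header).

    #include <emmintrin.h>

    /* Logical right shift of every 16-bit lane by one, two ways. */
    static __m128i halve_lanes_imm(__m128i v)
    {
        return _mm_srli_epi16(v, 1);
    }

    static __m128i halve_lanes_vec(__m128i v)
    {
        return _mm_srl_epi16(v, _mm_cvtsi32_si128(1));
    }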
 
-/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
+/// Right-shifts each of the 32-bit values in the 128-bit integer vector
 ///    operand by the specified number of bits. High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    An integer value specifying the number of bits to right-shift each value
-///    in operand __a.
+///    in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi32(__m128i __a, int __count)
@@ -1415,18 +3074,18 @@
   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
 }
 
-/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
+/// Right-shifts each of the 32-bit values in the 128-bit integer vector
 ///    operand by the specified number of bits. High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand __a.
+///    to right-shift each value in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi32(__m128i __a, __m128i __count)
@@ -1434,18 +3093,18 @@
   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
 }
 
-/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
+/// Right-shifts each of the 64-bit values in the 128-bit integer vector
 ///    operand by the specified number of bits. High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    An integer value specifying the number of bits to right-shift each value
-///    in operand __a.
+///    in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi64(__m128i __a, int __count)
@@ -1453,18 +3112,18 @@
   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
 }
 
-/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
+/// Right-shifts each of the 64-bit values in the 128-bit integer vector
 ///    operand by the specified number of bits. High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector containing the source operand.
 /// \param __count
 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand __a.
+///    to right-shift each value in operand \a __a.
 /// \returns A 128-bit integer vector containing the right-shifted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi64(__m128i __a, __m128i __count)
@@ -1472,13 +3131,13 @@
   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
 }
 
-/// \brief Compares each of the corresponding 8-bit values of the 128-bit
-///    integer vectors for equality. Each comparison yields 0h for false, FFh
+/// Compares each of the corresponding 8-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
 ///    for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
+/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1491,13 +3150,13 @@
   return (__m128i)((__v16qi)__a == (__v16qi)__b);
 }
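
A small sketch of how the 0x0/0xFF lane masks are typically consumed, assuming
SSE2; _mm_movemask_epi8 appears further below in this header.

    #include <emmintrin.h>

    /* True when all sixteen byte lanes of a and b compare equal: every
     * equal lane is 0xFF, so the collected sign bits form 0xFFFF. */
    static int all_bytes_equal(__m128i a, __m128i b)
    {
        return _mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) == 0xFFFF;
    }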
 
-/// \brief Compares each of the corresponding 16-bit values of the 128-bit
-///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
-///    for true.
+/// Compares each of the corresponding 16-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0x0 for false,
+///    0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
+/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1510,13 +3169,13 @@
   return (__m128i)((__v8hi)__a == (__v8hi)__b);
 }
 
-/// \brief Compares each of the corresponding 32-bit values of the 128-bit
-///    integer vectors for equality. Each comparison yields 0h for false,
-///    FFFFFFFFh for true.
+/// Compares each of the corresponding 32-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0x0 for false,
+///    0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
+/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1529,14 +3188,14 @@
   return (__m128i)((__v4si)__a == (__v4si)__b);
 }
 
-/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+/// Compares each of the corresponding signed 8-bit values of the 128-bit
 ///    integer vectors to determine if the values in the first operand are
-///    greater than those in the second operand. Each comparison yields 0h for
-///    false, FFh for true.
+///    greater than those in the second operand. Each comparison yields 0x0 for
+///    false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1551,14 +3210,15 @@
   return (__m128i)((__v16qs)__a > (__v16qs)__b);
 }
 
-/// \brief Compares each of the corresponding signed 16-bit values of the
+/// Compares each of the corresponding signed 16-bit values of the
 ///    128-bit integer vectors to determine if the values in the first operand
-///    are greater than those in the second operand. Each comparison yields 0h
-///    for false, FFFFh for true.
+///    are greater than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1571,14 +3231,15 @@
   return (__m128i)((__v8hi)__a > (__v8hi)__b);
 }
 
-/// \brief Compares each of the corresponding signed 32-bit values of the
+/// Compares each of the corresponding signed 32-bit values of the
 ///    128-bit integer vectors to determine if the values in the first operand
-///    are greater than those in the second operand. Each comparison yields 0h
-///    for false, FFFFFFFFh for true.
+///    are greater than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1591,14 +3252,15 @@
   return (__m128i)((__v4si)__a > (__v4si)__b);
 }
 
-/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+/// Compares each of the corresponding signed 8-bit values of the 128-bit
 ///    integer vectors to determine if the values in the first operand are less
-///    than those in the second operand. Each comparison yields 0h for false,
-///    FFh for true.
+///    than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1611,14 +3273,15 @@
   return _mm_cmpgt_epi8(__b, __a);
 }
 
-/// \brief Compares each of the corresponding signed 16-bit values of the
+/// Compares each of the corresponding signed 16-bit values of the
 ///    128-bit integer vectors to determine if the values in the first operand
-///    are less than those in the second operand. Each comparison yields 0h for
-///    false, FFFFh for true.
+///    are less than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1631,14 +3294,15 @@
   return _mm_cmpgt_epi16(__b, __a);
 }
 
-/// \brief Compares each of the corresponding signed 32-bit values of the
+/// Compares each of the corresponding signed 32-bit values of the
 ///    128-bit integer vectors to determine if the values in the first operand
-///    are less than those in the second operand. Each comparison yields 0h for
-///    false, FFFFFFFFh for true.
+///    are less than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1652,14 +3316,14 @@
 }
 
 #ifdef __x86_64__
-/// \brief Converts a 64-bit signed integer value from the second operand into a
+/// Converts a 64-bit signed integer value from the second operand into a
 ///    double-precision value and returns it in the lower element of a [2 x
 ///    double] vector; the upper element of the returned vector is copied from
 ///    the upper element of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
@@ -1676,12 +3340,12 @@
   return __a;
 }
 
-/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+/// Converts the first (lower) element of a vector of [2 x double] into a
 ///    64-bit signed integer value, according to the current rounding mode.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
@@ -1693,12 +3357,13 @@
   return __builtin_ia32_cvtsd2si64((__v2df)__a);
 }
 
-/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+/// Converts the first (lower) element of a vector of [2 x double] into a
 ///    64-bit signed integer value, truncating the result when it is inexact.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+///   instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
@@ -1707,15 +3372,15 @@
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvttsd_si64(__m128d __a)
 {
-  return __a[0];
+  return __builtin_ia32_cvttsd2si64((__v2df)__a);
 }
 #endif
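
A sketch of the rounding difference between the two conversions above,
assuming x86_64 and the default round-to-nearest MXCSR mode.

    #include <emmintrin.h>

    #ifdef __x86_64__
    static void convert_demo(void)
    {
        __m128d v = _mm_set_sd(2.7);
        long long rounded   = _mm_cvtsd_si64(v);   /* 3: current rounding mode */
        long long truncated = _mm_cvttsd_si64(v);  /* 2: always toward zero */
        (void)rounded; (void)truncated;
    }
    #endif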
 
-/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
+/// Converts a vector of [4 x i32] into a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector.
@@ -1723,14 +3388,14 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtepi32_ps(__m128i __a)
 {
-  return __builtin_ia32_cvtdq2ps((__v4si)__a);
+  return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
 }
 
-/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
+/// Converts a vector of [4 x float] into a vector of [4 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -1742,12 +3407,13 @@
   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
 }
 
-/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
+/// Converts a vector of [4 x float] into a vector of [4 x i32],
 ///    truncating the result when it is inexact.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
+///   instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -1755,15 +3421,15 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvttps_epi32(__m128 __a)
 {
-  return (__m128i)__builtin_convertvector((__v4sf)__a, __v4si);
+  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
 }
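
A short sketch of the truncating conversion, assuming SSE2.

    #include <emmintrin.h>

    /* Truncation is toward zero regardless of the rounding mode:
     * 1.9f becomes 1 and -2.25f becomes -2. */
    static __m128i trunc_demo(void)
    {
        __m128 f = _mm_set_ps(3.5f, -2.25f, 1.9f, 0.0f);
        return _mm_cvttps_epi32(f);   /* lanes 0..3: 0, 1, -2, 3 */
    }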
 
-/// \brief Returns a vector of [4 x i32] where the lowest element is the input
+/// Returns a vector of [4 x i32] where the lowest element is the input
 ///    operand and the remaining elements are zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
 ///
 /// \param __a
 ///    A 32-bit signed integer operand.
@@ -1771,16 +3437,16 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi32_si128(int __a)
 {
-  return (__m128i)(__v4si){ __a, 0, 0, 0 };
+  return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
 }
 
 #ifdef __x86_64__
-/// \brief Returns a vector of [2 x i64] where the lower element is the input
+/// Returns a vector of [2 x i64] where the lower element is the input
 ///    operand and the upper element is zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit signed integer operand containing the value to be converted.
@@ -1788,16 +3454,16 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi64_si128(long long __a)
 {
-  return (__m128i){ __a, 0 };
+  return __extension__ (__m128i)(__v2di){ __a, 0 };
 }
 #endif
 
-/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
+/// Moves the least significant 32 bits of a vector of [4 x i32] to a
 ///    32-bit signed integer value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
 ///
 /// \param __a
 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
@@ -1811,12 +3477,12 @@
 }
 
 #ifdef __x86_64__
-/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
+/// Moves the least significant 64 bits of a vector of [2 x i64] to a
 ///    64-bit signed integer value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
 ///
 /// \param __a
 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
@@ -1829,12 +3495,12 @@
 }
 #endif
 
-/// \brief Moves packed integer values from an aligned 128-bit memory location
+/// Moves packed integer values from an aligned 128-bit memory location
 ///    to elements in a 128-bit integer vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
+/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
 ///
 /// \param __p
 ///    An aligned pointer to a memory location containing integer values.
@@ -1845,12 +3511,12 @@
   return *__p;
 }
 
-/// \brief Moves packed integer values from an unaligned 128-bit memory location
+/// Moves packed integer values from an unaligned 128-bit memory location
 ///    to elements in a 128-bit integer vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
+/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a memory location containing integer values.
@@ -1864,12 +3530,12 @@
   return ((struct __loadu_si128*)__p)->__v;
 }
 
-/// \brief Returns a vector of [2 x i64] where the lower element is taken from
+/// Returns a vector of [2 x i64] where the lower element is taken from
 ///    the lower element of the operand, and the upper element is zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
 ///
 /// \param __p
 ///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
@@ -1882,10 +3548,10 @@
   struct __mm_loadl_epi64_struct {
     long long __u;
   } __attribute__((__packed__, __may_alias__));
-  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+  return __extension__ (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
 }
 
-/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
+/// Generates a 128-bit vector of [4 x i32] with unspecified content.
 ///    This could be used as an argument to another intrinsic function where the
 ///    argument is required but the value is not actually used.
 ///
@@ -1900,7 +3566,7 @@
   return (__m128i)__builtin_ia32_undef128();
 }
 
-/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
 ///    the specified 64-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -1919,10 +3585,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi64x(long long __q1, long long __q0)
 {
-  return (__m128i){ __q0, __q1 };
+  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
 }
 
-/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
 ///    the specified 64-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -1941,10 +3607,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi64(__m64 __q1, __m64 __q0)
 {
-  return (__m128i){ (long long)__q0, (long long)__q1 };
+  return _mm_set_epi64x((long long)__q1, (long long)__q0);
 }
 
-/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
+/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
 ///    the specified 32-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -1969,10 +3635,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
 {
-  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+  return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
 }
 
-/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
+/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
 ///    the specified 16-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -2009,10 +3675,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
 {
-  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+  return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
 }
 
-/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
+/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
 ///    the specified 8-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -2057,10 +3723,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
 {
-  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+  return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
 }
 
-/// \brief Initializes both values in a 128-bit integer vector with the
+/// Initializes both values in a 128-bit integer vector with the
 ///    specified 64-bit integer value.
 ///
 /// \headerfile <x86intrin.h>
@@ -2076,10 +3742,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64x(long long __q)
 {
-  return (__m128i){ __q, __q };
+  return _mm_set_epi64x(__q, __q);
 }
 
-/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
+/// Initializes both values in a 128-bit vector of [2 x i64] with the
 ///    specified 64-bit value.
 ///
 /// \headerfile <x86intrin.h>
@@ -2095,10 +3761,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64(__m64 __q)
 {
-  return (__m128i){ (long long)__q, (long long)__q };
+  return _mm_set_epi64(__q, __q);
 }
 
-/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
+/// Initializes all values in a 128-bit vector of [4 x i32] with the
 ///    specified 32-bit value.
 ///
 /// \headerfile <x86intrin.h>
@@ -2114,10 +3780,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi32(int __i)
 {
-  return (__m128i)(__v4si){ __i, __i, __i, __i };
+  return _mm_set_epi32(__i, __i, __i, __i);
 }
 
-/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
+/// Initializes all values in a 128-bit vector of [8 x i16] with the
 ///    specified 16-bit value.
 ///
 /// \headerfile <x86intrin.h>
@@ -2133,10 +3799,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi16(short __w)
 {
-  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
+  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
 }
 
-/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
+/// Initializes all values in a 128-bit vector of [16 x i8] with the
 ///    specified 8-bit value.
 ///
 /// \headerfile <x86intrin.h>
@@ -2152,45 +3818,172 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi8(char __b)
 {
-  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
+  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
 }
 
+/// Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q0
+///    A 64-bit integral value used to initialize the lower 64 bits of the
+///    result.
+/// \param __q1
+///    A 64-bit integral value used to initialize the upper 64 bits of the
+///    result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi64(__m64 __q0, __m64 __q1)
 {
-  return (__m128i){ (long long)__q0, (long long)__q1 };
+  return _mm_set_epi64(__q1, __q0);
 }
 
+/// Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
 {
-  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+  return _mm_set_epi32(__i3, __i2, __i1, __i0);
 }
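
A sketch of the argument-order difference between the _mm_set_* and
_mm_setr_* constructors, assuming SSE2; both calls below build the identical
vector.

    #include <emmintrin.h>

    static void order_demo(void)
    {
        __m128i a = _mm_set_epi32(40, 30, 20, 10);   /* element 0 = 10 */
        __m128i b = _mm_setr_epi32(10, 20, 30, 40);  /* element 0 = 10 */
        (void)a; (void)b;
    }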
 
+/// Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w0
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w1
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w2
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w3
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w4
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w5
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w6
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w7
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
 {
-  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
 }
 
+/// Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b0
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b1
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b2
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b3
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b4
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b5
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b6
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b7
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b8
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b9
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
 {
-  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 }
 
+/// Creates a 128-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit integer vector with all elements set to
+///    zero.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setzero_si128(void)
 {
-  return (__m128i){ 0LL, 0LL };
+  return __extension__ (__m128i)(__v2di){ 0LL, 0LL };
 }
 
+/// Stores a 128-bit integer vector to a memory location aligned on a
+///    128-bit boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location that will receive the integer
+///    values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_si128(__m128i *__p, __m128i __b)
 {
   *__p = __b;
 }
 
+/// Stores a 128-bit integer vector to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_si128(__m128i *__p, __m128i __b)
 {
@@ -2200,12 +3993,46 @@
   ((struct __storeu_si128*)__p)->__v = __b;
 }
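
A minimal sketch of the alignment contract, assuming SSE2 and a C11 compiler
for _Alignas: the aligned store may fault on a misaligned pointer, while the
unaligned store accepts any address.

    #include <emmintrin.h>

    static void store_both(int *any_alignment, __m128i v)
    {
        _Alignas(16) int buf[4];
        _mm_store_si128((__m128i *)buf, v);            /* needs 16-byte alignment */
        _mm_storeu_si128((__m128i *)any_alignment, v); /* no alignment needed */
    }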
 
+/// Moves bytes selected by the mask from the first operand to the
+///    specified unaligned memory location. When a mask bit is 1, the
+///    corresponding byte is written; otherwise it is not written.
+///
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon). Exception and trap behavior for elements not selected
+///    for storage to memory are implementation dependent.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
+///   instruction.
+///
+/// \param __d
+///    A 128-bit integer vector containing the values to be moved.
+/// \param __n
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each byte is that byte's mask bit.
+/// \param __p
+///    A pointer to an unaligned 128-bit memory location where the specified
+///    values are moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
 {
   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
 }
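
A sketch of a masked store, assuming SSE2 and x86's little-endian lane
layout; the mask below has 0x80 in each even-indexed byte, so only those
bytes of v are written.

    #include <emmintrin.h>

    static void store_even_bytes(char *p, __m128i v)
    {
        _mm_maskmoveu_si128(v, _mm_set1_epi16(0x0080), p);
    }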
 
+/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
+///    a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location that will receive the lower 64 bits
+///    of the integer vector parameter.
+/// \param __a
+///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
+///    value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_epi64(__m128i *__p, __m128i __a)
 {
@@ -2215,244 +4042,854 @@
   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
 }
 
+/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
+///    aligned memory location.
+///
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A vector of [2 x double] containing the 64-bit values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pd(double *__p, __m128d __a)
 {
   __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
 }
 
+/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
+///
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A 128-bit integer vector containing the values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si128(__m128i *__p, __m128i __a)
 {
   __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
 }
 
-static __inline__ void __DEFAULT_FN_ATTRS
+/// Stores a 32-bit integer value in the specified memory location.
+///
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
+///
+/// \param __p
+///    A pointer to the 32-bit memory location used to store the value.
+/// \param __a
+///    A 32-bit integer containing the value to be stored.
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
 _mm_stream_si32(int *__p, int __a)
 {
   __builtin_ia32_movnti(__p, __a);
 }
 
 #ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
+/// Stores a 64-bit integer value in the specified memory location.
+///
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
+///
+/// \param __p
+///    A pointer to the 64-bit memory location used to store the value.
+/// \param __a
+///    A 64-bit integer containing the value to be stored.
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
 _mm_stream_si64(long long *__p, long long __a)
 {
   __builtin_ia32_movnti64(__p, __a);
 }
 #endif
 
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clflush(void const *__p)
-{
-  __builtin_ia32_clflush(__p);
-}
+#if defined(__cplusplus)
+extern "C" {
+#endif
 
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_lfence(void)
-{
-  __builtin_ia32_lfence();
-}
+/// The cache line containing \a __p is flushed and invalidated from all
+///    caches in the coherency domain.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
+///
+/// \param __p
+///    A pointer to the memory location used to identify the cache line to be
+///    flushed.
+void _mm_clflush(void const * __p);
 
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_mfence(void)
-{
-  __builtin_ia32_mfence();
-}
+/// Forces strong memory ordering (serialization) between load
+///    instructions preceding this instruction and load instructions following
+///    this instruction, ensuring the system completes all previous loads before
+///    executing subsequent loads.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
+///
+void _mm_lfence(void);
 
+/// Forces strong memory ordering (serialization) between load and store
+///    instructions preceding this instruction and load and store instructions
+///    following this instruction, ensuring that the system completes all
+///    previous memory accesses before executing subsequent memory accesses.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
+///
+void _mm_mfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
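
A sketch combining the non-temporal stores above with the fence declared
here, assuming SSE2; publish and ready are illustrative names, and
_mm_sfence() from <xmmintrin.h> would equally order the stores.

    #include <emmintrin.h>

    /* Fill a buffer with streaming stores, then fence before raising the
     * flag so other observers see the data first. */
    static void publish(int *buf, int n, volatile int *ready)
    {
        for (int i = 0; i < n; ++i)
            _mm_stream_si32(&buf[i], i);
        _mm_mfence();
        *ready = 1;
    }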
+
+/// Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
+///
+/// \param __a
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to an 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the lower 64 bits of the result.
+/// \param __b
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to an 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// Converts 32-bit signed integers from both 128-bit integer vector
+///    operands into 16-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
 }
 
+/// Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit unsigned integers, and packs the results into the
+///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packus_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
 }
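
A short saturation sketch for the signed pack, assuming SSE2.

    #include <emmintrin.h>

    /* 1000 and 300 clamp to 127 (0x7F); -200 clamps to -128 (0x80). */
    static __m128i pack_demo(void)
    {
        __m128i w = _mm_setr_epi16(1000, -200, 5, -5, 0, 127, -128, 300);
        return _mm_packs_epi16(w, w);  /* each half: 127,-128,5,-5,0,127,-128,127 */
    }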
 
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_extract_epi16(__m128i __a, int __imm)
-{
-  __v8hi __b = (__v8hi)__a;
-  return (unsigned short)__b[__imm & 7];
-}
+/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
+///    the immediate-value parameter as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __imm
+///    An immediate value. Bits [2:0] select values from \a __a to be assigned
+///    to bits [15:0] of the result. \n
+///    000: assign values from bits [15:0] of \a __a. \n
+///    001: assign values from bits [31:16] of \a __a. \n
+///    010: assign values from bits [47:32] of \a __a. \n
+///    011: assign values from bits [63:48] of \a __a. \n
+///    100: assign values from bits [79:64] of \a __a. \n
+///    101: assign values from bits [95:80] of \a __a. \n
+///    110: assign values from bits [111:96] of \a __a. \n
+///    111: assign values from bits [127:112] of \a __a.
+/// \returns An integer whose lower 16 bits are selected from the 128-bit
+///    integer vector parameter; the remaining bits are assigned zeros.
+#define _mm_extract_epi16(a, imm) \
+  (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
+                                                   (int)(imm))
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_insert_epi16(__m128i __a, int __b, int __imm)
-{
-  __v8hi __c = (__v8hi)__a;
-  __c[__imm & 7] = __b;
-  return (__m128i)__c;
-}
+/// Constructs a 128-bit integer vector by first making a copy of the
+///    128-bit integer vector parameter, and then inserting the lower 16 bits
+///    of an integer parameter into an offset specified by the immediate-value
+///    parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
+///    result and then one of the eight elements in the result is replaced by
+///    the lower 16 bits of \a __b.
+/// \param __b
+///    An integer. The lower 16 bits of this parameter are written to the
+///    result beginning at an offset specified by \a __imm.
+/// \param __imm
+///    An immediate value specifying the bit offset in the result at which the
+///    lower 16 bits of \a __b are written.
+/// \returns A 128-bit integer vector containing the constructed values.
+#define _mm_insert_epi16(a, b, imm) \
+  (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
+                                       (int)(imm))
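
A minimal sketch of the two macros, assuming SSE2; both selectors must be
integer constant expressions.

    #include <emmintrin.h>

    static __m128i replace_word3(__m128i v)
    {
        int old = _mm_extract_epi16(v, 3);   /* word 3, zero-extended to int */
        (void)old;
        return _mm_insert_epi16(v, 0x1234, 3);
    }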
 
+/// Copies the values of the most significant bits from each 8-bit
+///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
+///    value, zero-extends the value, and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bits from each 8-bit element in \a __a,
+///    written to bits [15:0]. The other bits are assigned zeros.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_epi8(__m128i __a)
 {
   return __builtin_ia32_pmovmskb128((__v16qi)__a);
 }
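
A sketch of the classic compare-and-scan idiom built from _mm_cmpeq_epi8 and
_mm_movemask_epi8, assuming SSE2; __builtin_ctz is a GCC/Clang builtin and
find_byte is an illustrative name.

    #include <emmintrin.h>

    /* Index of the first byte of chunk equal to c, or -1. */
    static int find_byte(__m128i chunk, char c)
    {
        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, _mm_set1_epi8(c)));
        return mask ? __builtin_ctz(mask) : -1;
    }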
 
-#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
-  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
-                                   (__v4si)_mm_undefined_si128(), \
-                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
-                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
+/// Constructs a 128-bit integer vector by shuffling four 32-bit
+///    elements of a 128-bit integer vector parameter, using the immediate-value
+///    parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param imm
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from \a a. The destinations within the 128-bit result are assigned
+///    values as follows: \n
+///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
+///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
+///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
+///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
+///    Bit value assignments: \n
+///    00: assign values from bits [31:0] of \a a. \n
+///    01: assign values from bits [63:32] of \a a. \n
+///    10: assign values from bits [95:64] of \a a. \n
+///    11: assign values from bits [127:96] of \a a.
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shuffle_epi32(a, imm) \
+  (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))
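
A sketch of two common selector values, assuming SSE2: 0x00 repeats element 0
four times, while 0x1B (the indices 3, 2, 1, 0 packed low-to-high) reverses
the elements.

    #include <emmintrin.h>

    static __m128i broadcast_elem0(__m128i v) { return _mm_shuffle_epi32(v, 0x00); }
    static __m128i reverse_elems(__m128i v)   { return _mm_shuffle_epi32(v, 0x1B); }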
 
-#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
-  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
-                                   (__v8hi)_mm_undefined_si128(), \
-                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
-                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
-                                   4, 5, 6, 7); })
+/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
+///    [127:64] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from \a a. \n
+///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
+///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
+///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
+///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
+///    Bit value assignments: \n
+///    00: assign values from bits [15:0] of \a a. \n
+///    01: assign values from bits [31:16] of \a a. \n
+///    10: assign values from bits [47:32] of \a a. \n
+///    11: assign values from bits [63:48] of \a a.
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shufflelo_epi16(a, imm) \
+  (__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))
 
-#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
-  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
-                                   (__v8hi)_mm_undefined_si128(), \
-                                   0, 1, 2, 3, \
-                                   4 + (((imm) >> 0) & 0x3), \
-                                   4 + (((imm) >> 2) & 0x3), \
-                                   4 + (((imm) >> 4) & 0x3), \
-                                   4 + (((imm) >> 6) & 0x3)); })
+/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
+///    [63:0] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from \a a. \n
+///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
+///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
+///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
+///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
+///    Bit value assignments: \n
+///    00: assign values from bits [79:64] of \a a. \n
+///    01: assign values from bits [95:80] of \a a. \n
+///    10: assign values from bits [111:96] of \a a. \n
+///    11: assign values from bits [127:112] of \a a.
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shufflehi_epi16(a, imm) \
+  (__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))
 
+/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
+///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8]. \n
+///    Bits [71:64] are written to bits [7:0] of the result. \n
+///    Bits [79:72] are written to bits [23:16] of the result. \n
+///    Bits [87:80] are written to bits [39:32] of the result. \n
+///    Bits [95:88] are written to bits [55:48] of the result. \n
+///    Bits [103:96] are written to bits [71:64] of the result. \n
+///    Bits [111:104] are written to bits [87:80] of the result. \n
+///    Bits [119:112] are written to bits [103:96] of the result. \n
+///    Bits [127:120] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8]. \n
+///    Bits [71:64] are written to bits [15:8] of the result. \n
+///    Bits [79:72] are written to bits [31:24] of the result. \n
+///    Bits [87:80] are written to bits [47:40] of the result. \n
+///    Bits [95:88] are written to bits [63:56] of the result. \n
+///    Bits [103:96] are written to bits [79:72] of the result. \n
+///    Bits [111:104] are written to bits [95:88] of the result. \n
+///    Bits [119:112] are written to bits [111:104] of the result. \n
+///    Bits [127:120] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
 }
 
+/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
+///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16]. \n
+///    Bits [79:64] are written to bits [15:0] of the result. \n
+///    Bits [95:80] are written to bits [47:32] of the result. \n
+///    Bits [111:96] are written to bits [79:64] of the result. \n
+///    Bits [127:112] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16]. \n
+///    Bits [79:64] are written to bits [31:16] of the result. \n
+///    Bits [95:80] are written to bits [63:48] of the result. \n
+///    Bits [111:96] are written to bits [95:80] of the result. \n
+///    Bits [127:112] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
 }
 
+/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [95:64] are written to bits [31:0] of the destination. \n
+///    Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [95:64] are written to bits [63:32] of the destination. \n
+///    Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
 }
 
+/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
+///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
 }
 
+/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
+///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8]. \n
+///    Bits [7:0] are written to bits [7:0] of the result. \n
+///    Bits [15:8] are written to bits [23:16] of the result. \n
+///    Bits [23:16] are written to bits [39:32] of the result. \n
+///    Bits [31:24] are written to bits [55:48] of the result. \n
+///    Bits [39:32] are written to bits [71:64] of the result. \n
+///    Bits [47:40] are written to bits [87:80] of the result. \n
+///    Bits [55:48] are written to bits [103:96] of the result. \n
+///    Bits [63:56] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8]. \n
+///    Bits [7:0] are written to bits [15:8] of the result. \n
+///    Bits [15:8] are written to bits [31:24] of the result. \n
+///    Bits [23:16] are written to bits [47:40] of the result. \n
+///    Bits [31:24] are written to bits [63:56] of the result. \n
+///    Bits [39:32] are written to bits [79:72] of the result. \n
+///    Bits [47:40] are written to bits [95:88] of the result. \n
+///    Bits [55:48] are written to bits [111:104] of the result. \n
+///    Bits [63:56] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
 }
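
A sketch of the common widening idiom built on this unpack, assuming SSE2.

    #include <emmintrin.h>

    /* Zero-extend the low eight unsigned bytes of v to 16-bit lanes by
     * interleaving with zeros -- a pre-SSE4.1 substitute for pmovzxbw. */
    static __m128i zero_extend_low_bytes(__m128i v)
    {
        return _mm_unpacklo_epi8(v, _mm_setzero_si128());
    }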
 
+/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
+///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
+///    [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16]. \n
+///    Bits [15:0] are written to bits [15:0] of the result. \n
+///    Bits [31:16] are written to bits [47:32] of the result. \n
+///    Bits [47:32] are written to bits [79:64] of the result. \n
+///    Bits [63:48] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16]. \n
+///    Bits [15:0] are written to bits [31:16] of the result. \n
+///    Bits [31:16] are written to bits [63:48] of the result. \n
+///    Bits [47:32] are written to bits [95:80] of the result. \n
+///    Bits [63:48] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
 }
 
+/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [31:0] are written to bits [31:0] of the destination. \n
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [31:0] are written to bits [63:32] of the destination. \n
+///    Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
 }
 
+/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
+///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
 }
 
+/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
+///    integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_movepi64_pi64(__m128i __a)
 {
   return (__m64)__a[0];
 }
 
+/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+///    upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit value.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_movpi64_epi64(__m64 __a)
 {
-  return (__m128i){ (long long)__a, 0 };
+  return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
 }
 
+/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
+///    integer vector, zeroing the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_move_epi64(__m128i __a)
 {
-  return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
+  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
 }
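
(A short sketch of the zero-extension behavior described above.)

    __m128i v  = _mm_set_epi64x(0x1122334455667788LL, 42);  /* high, low */
    __m128i lo = _mm_move_epi64(v);  /* lo = { 42, 0 }: low half kept, high half zeroed */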
 
+/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
+///    [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpackhi_pd(__m128d __a, __m128d __b)
 {
   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
 }
 
+/// Unpacks the low-order 64-bit elements from two 128-bit vectors
+///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpacklo_pd(__m128d __a, __m128d __b)
 {
   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
 }
 
+/// Extracts the sign bits of the two double-precision values in the 128-bit
+///    vector of [2 x double], combines them into a 2-bit value, zero-extends
+///    it, and writes the result to the low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values with sign bits to
+///    be extracted.
+/// \returns The sign bits from each of the double-precision elements in \a __a,
+///    written to bits [1:0]. The remaining bits are assigned values of zero.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_pd(__m128d __a)
 {
   return __builtin_ia32_movmskpd((__v2df)__a);
 }
 
-#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
-  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
-                                   0 + (((i) >> 0) & 0x1), \
-                                   2 + (((i) >> 1) & 0x1)); })
 
+/// Constructs a 128-bit floating-point vector of [2 x double] from two
+///    128-bit vector parameters of [2 x double], using the immediate-value
+///    parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param i
+///    An 8-bit immediate value. The least significant two bits specify which
+///    elements to copy from \a a and \a b: \n
+///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
+///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
+///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
+///    Bit[1] = 1: upper element of \a b copied to upper element of result.
+/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
+#define _mm_shuffle_pd(a, b, i) \
+  (__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
+                                 (int)(i))
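
(Illustrative use of the two immediate bits; values chosen arbitrarily.)

    __m128d a = _mm_setr_pd(1.0, 2.0);    /* a = { 1.0, 2.0 } */
    __m128d b = _mm_setr_pd(3.0, 4.0);    /* b = { 3.0, 4.0 } */
    __m128d r = _mm_shuffle_pd(a, b, 1);  /* bit0=1: upper of a; bit1=0: lower of b */
                                          /* r = { 2.0, 3.0 } */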
+
+/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castpd_ps(__m128d __a)
 {
   return (__m128)__a;
 }
 
+/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castpd_si128(__m128d __a)
 {
   return (__m128i)__a;
 }
 
+/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castps_pd(__m128 __a)
 {
   return (__m128d)__a;
 }
 
+/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castps_si128(__m128 __a)
 {
   return (__m128i)__a;
 }
 
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castsi128_ps(__m128i __a)
 {
   return (__m128)__a;
 }
 
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castsi128_pd(__m128i __a)
 {
   return (__m128d)__a;
 }
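
(The six cast intrinsics above reinterpret bits and emit no instruction; a
sketch contrasting a cast with a genuine conversion:)

    __m128i bits = _mm_set1_epi32(0x3F800000);  /* IEEE-754 bit pattern of 1.0f */
    __m128  f    = _mm_castsi128_ps(bits);      /* { 1.0f, 1.0f, 1.0f, 1.0f } */
    __m128  g    = _mm_cvtepi32_ps(bits);       /* ~1.06535322e9f in each lane */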
 
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_pause(void)
-{
-  __builtin_ia32_pause();
-}
+#if defined(__cplusplus)
+extern "C" {
+#endif
 
+/// Indicates that a spin loop is being executed for the purposes of
+///    optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
+///
+void _mm_pause(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
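
(Typical spin-wait pattern; `flag` is a hypothetical shared variable set by
another thread.)

    while (__atomic_load_n(&flag, __ATOMIC_ACQUIRE) == 0)
      _mm_pause();  /* hint that this is a busy-wait loop */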
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_MMX
 
 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
 
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
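
(Sketch of saving, setting, and restoring the denormals-are-zero bit with the
macros above.)

    unsigned int old_daz = _MM_GET_DENORMALS_ZERO_MODE();
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    /* ... hot loop where subnormal inputs may be treated as zero ... */
    _MM_SET_DENORMALS_ZERO_MODE(old_daz);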
+
 #endif /* __EMMINTRIN_H */
diff --git a/darwin-x86/clang-headers/f16cintrin.h b/darwin-x86/clang-headers/f16cintrin.h
index 415bf73..3d35f28 100644
--- a/darwin-x86/clang-headers/f16cintrin.h
+++ b/darwin-x86/clang-headers/f16cintrin.h
@@ -21,28 +21,35 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
+#if !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
 #endif
 
 #ifndef __F16CINTRIN_H
 #define __F16CINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+#define __DEFAULT_FN_ATTRS128 \
+  __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+  __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))
 
-/// \brief Converts a 16-bit half-precision float value into a 32-bit float
+/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
+ * but that's because icc can emulate these without f16c using a library call.
+ * Since we don't do that, let's leave these in f16cintrin.h.
+ */
+
+/// Converts a 16-bit half-precision float value into a 32-bit float
 ///    value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTPH2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
 ///
 /// \param __a
 ///    A 16-bit half-precision float value.
 /// \returns The converted 32-bit float value.
-static __inline float __DEFAULT_FN_ATTRS
+static __inline float __DEFAULT_FN_ATTRS128
 _cvtsh_ss(unsigned short __a)
 {
   __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
@@ -50,7 +57,7 @@
   return r[0];
 }
 
-/// \brief Converts a 32-bit single-precision float value to a 16-bit
+/// Converts a 32-bit single-precision float value to a 16-bit
 ///    half-precision float value.
 ///
 /// \headerfile <x86intrin.h>
@@ -59,24 +66,24 @@
 /// unsigned short _cvtss_sh(float a, const int imm);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VCVTPS2PH instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
 ///
 /// \param a
 ///    A 32-bit single-precision float value to be converted to a 16-bit
 ///    half-precision float value.
 /// \param imm
-///    An immediate value controlling rounding using bits [2:0]:
-///    000: Nearest
-///    001: Down
-///    010: Up
-///    011: Truncate
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
 ///    1XX: Use MXCSR.RC for rounding
 /// \returns The converted 16-bit half-precision float value.
-#define _cvtss_sh(a, imm)  \
-  ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
-                                                      (imm)))[0]))
+#define _cvtss_sh(a, imm) \
+  (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+                                                     (imm)))[0])
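
(Round-trip sketch, assuming F16C is available; the reconstructed value
reflects half-precision rounding.)

    unsigned short h = _cvtss_sh(3.14159f, 0);  /* 0 = round to nearest */
    float f = _cvtsh_ss(h);                     /* ~3.140625f */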
 
-/// \brief Converts a 128-bit vector containing 32-bit float values into a
+/// Converts a 128-bit vector containing 32-bit float values into a
 ///    128-bit vector containing 16-bit half-precision float values.
 ///
 /// \headerfile <x86intrin.h>
@@ -85,40 +92,85 @@
 /// __m128i _mm_cvtps_ph(__m128 a, const int imm);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VCVTPS2PH instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
 ///
 /// \param a
 ///    A 128-bit vector containing 32-bit float values.
 /// \param imm
-///    An immediate value controlling rounding using bits [2:0]:
-///    000: Nearest
-///    001: Down
-///    010: Up
-///    011: Truncate
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
 ///    1XX: Use MXCSR.RC for rounding
 /// \returns A 128-bit vector containing converted 16-bit half-precision float
 ///    values. The lower 64 bits are used to store the converted 16-bit
 ///    half-precision floating-point values.
 #define _mm_cvtps_ph(a, imm) \
-  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
+  (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))
 
-/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+/// Converts a 128-bit vector containing 16-bit half-precision float
 ///    values into a 128-bit vector containing 32-bit float values.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTPH2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector containing 16-bit half-precision float values. The lower
 ///    64 bits are used in the conversion.
 /// \returns A 128-bit vector of [4 x float] containing converted float values.
-static __inline __m128 __DEFAULT_FN_ATTRS
+static __inline __m128 __DEFAULT_FN_ATTRS128
 _mm_cvtph_ps(__m128i __a)
 {
   return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
 }
 
-#undef __DEFAULT_FN_ATTRS
+/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
+///    containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 256-bit vector containing 32-bit single-precision float values to be
+///    converted to 16-bit half-precision float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision
+///    float values.
+#define _mm256_cvtps_ph(a, imm) \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))
+
+/// Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values to be
+///    converted to 32-bit single-precision float values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+///    single-precision float values.
+static __inline __m256 __DEFAULT_FN_ATTRS256
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
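
(Illustrative compress/expand of eight floats through half precision; 1.5f is
exactly representable in fp16, so it survives the round trip.)

    __m256  v    = _mm256_set1_ps(1.5f);
    __m128i half = _mm256_cvtps_ph(v, 0);  /* 8 x fp16 packed into 128 bits */
    __m256  back = _mm256_cvtph_ps(half);  /* { 1.5f, ... } */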
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
 
 #endif /* __F16CINTRIN_H */
diff --git a/darwin-x86/clang-headers/float.h b/darwin-x86/clang-headers/float.h
index a28269e..56215cd 100644
--- a/darwin-x86/clang-headers/float.h
+++ b/darwin-x86/clang-headers/float.h
@@ -21,15 +21,27 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef __FLOAT_H
-#define __FLOAT_H
+#ifndef __CLANG_FLOAT_H
+#define __CLANG_FLOAT_H
 
 /* If we're on MinGW, fall back to the system's float.h, which might have
  * additional definitions provided for Windows.
  * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ *
+ * Also fall back on Darwin to allow additional definitions and
+ * implementation-defined values.
  */
-#if (defined(__MINGW32__) || defined(_MSC_VER)) && __STDC_HOSTED__ && \
-    __has_include_next(<float.h>)
+#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
+    __STDC_HOSTED__ && __has_include_next(<float.h>)
+
+/* Prior to Apple's 10.7 SDK, the float.h SDK header applied an extra level
+ * of #include_next<float.h> to keep Metrowerks compilers happy. Avoid this
+ * extra indirection.
+ */
+#ifdef __APPLE__
+#define _FLOAT_H_
+#endif
+
 #  include_next <float.h>
 
 /* Undefine anything that we'll be redefining below. */
@@ -73,6 +85,9 @@
 #    undef FLT_DECIMAL_DIG
 #    undef DBL_DECIMAL_DIG
 #    undef LDBL_DECIMAL_DIG
+#    undef FLT_HAS_SUBNORM
+#    undef DBL_HAS_SUBNORM
+#    undef LDBL_HAS_SUBNORM
 #  endif
 #endif
 
@@ -129,6 +144,23 @@
 #  define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
 #  define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
 #  define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
+#  define FLT_HAS_SUBNORM __FLT_HAS_DENORM__
+#  define DBL_HAS_SUBNORM __DBL_HAS_DENORM__
+#  define LDBL_HAS_SUBNORM __LDBL_HAS_DENORM__
 #endif
 
-#endif /* __FLOAT_H */
+#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__
+#  define FLT16_MANT_DIG    __FLT16_MANT_DIG__
+#  define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__
+#  define FLT16_DIG         __FLT16_DIG__
+#  define FLT16_MIN_EXP     __FLT16_MIN_EXP__
+#  define FLT16_MIN_10_EXP  __FLT16_MIN_10_EXP__
+#  define FLT16_MAX_EXP     __FLT16_MAX_EXP__
+#  define FLT16_MAX_10_EXP  __FLT16_MAX_10_EXP__
+#  define FLT16_MAX         __FLT16_MAX__
+#  define FLT16_EPSILON     __FLT16_EPSILON__
+#  define FLT16_MIN         __FLT16_MIN__
+#  define FLT16_TRUE_MIN    __FLT16_TRUE_MIN__
+#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */
+
+#endif /* __CLANG_FLOAT_H */
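
(How the new FLT16_* macros are reached; per the IEC 60559 TS, the request
macro must be defined before the include. Values assume a target with
_Float16 support.)

    #define __STDC_WANT_IEC_60559_TYPES_EXT__ 1
    #include <float.h>
    /* FLT16_MANT_DIG == 11, FLT16_MAX == 65504.0 */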
diff --git a/darwin-x86/clang-headers/fma4intrin.h b/darwin-x86/clang-headers/fma4intrin.h
index 11aa8ce..7bae2f4 100644
--- a/darwin-x86/clang-headers/fma4intrin.h
+++ b/darwin-x86/clang-headers/fma4intrin.h
@@ -31,200 +31,202 @@
 #include <pmmintrin.h>
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
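
(A note on the pattern above, which recurs throughout this hunk: the dedicated
vfmsub/vfnmadd/vfnmsub builtins are replaced by one fmadd builtin plus operand
negation, using the identity a*b - c == a*b + (-c).)

    /* e.g. _mm_msub_ps(a, b, c) is now fmadd(a, b, -c) == a*b - c, per element */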
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
 
 #endif /* __FMA4INTRIN_H */
diff --git a/darwin-x86/clang-headers/fmaintrin.h b/darwin-x86/clang-headers/fmaintrin.h
index 0e2ef0b..094d13a 100644
--- a/darwin-x86/clang-headers/fmaintrin.h
+++ b/darwin-x86/clang-headers/fmaintrin.h
@@ -1,4 +1,4 @@
-/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -29,200 +29,202 @@
 #define __FMAINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
 
 #endif /* __FMAINTRIN_H */
diff --git a/darwin-x86/clang-headers/fxsrintrin.h b/darwin-x86/clang-headers/fxsrintrin.h
index ac6026a..704b5ad 100644
--- a/darwin-x86/clang-headers/fxsrintrin.h
+++ b/darwin-x86/clang-headers/fxsrintrin.h
@@ -30,25 +30,75 @@
 
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))
 
+/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
 static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave(void *__p) {
-  return __builtin_ia32_fxsave(__p);
+_fxsave(void *__p)
+{
+  __builtin_ia32_fxsave(__p);
 }
 
+/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
 static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave64(void *__p) {
-  return __builtin_ia32_fxsave64(__p);
+_fxrstor(void *__p)
+{
+  __builtin_ia32_fxrstor(__p);
 }
 
+#ifdef __x86_64__
+/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
 static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor(void *__p) {
-  return __builtin_ia32_fxrstor(__p);
+_fxsave64(void *__p)
+{
+  __builtin_ia32_fxsave64(__p);
 }
 
+/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
 static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor64(void *__p) {
-  return __builtin_ia32_fxrstor64(__p);
+_fxrstor64(void *__p)
+{
+  __builtin_ia32_fxrstor64(__p);
 }
+#endif
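
(Sketch of pairing _fxsave with _fxrstor; the 16-byte alignment noted above is
the caller's responsibility.)

    _Alignas(16) unsigned char fpstate[512];
    _fxsave(fpstate);
    /* ... code that may clobber x87/MMX/XMM state ... */
    _fxrstor(fpstate);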
 
 #undef __DEFAULT_FN_ATTRS
 
diff --git a/darwin-x86/clang-headers/gfniintrin.h b/darwin-x86/clang-headers/gfniintrin.h
new file mode 100644
index 0000000..804d4f3
--- /dev/null
+++ b/darwin-x86/clang-headers/gfniintrin.h
@@ -0,0 +1,208 @@
+/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __GFNIINTRIN_H
+#define __GFNIINTRIN_H
+
+
+#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
+  (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A),          \
+                                                  (__v16qi)(__m128i)(B),          \
+                                                  (char)(I))
+
+#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
+        (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I),                          \
+        (__v16qi)(__m128i)(S))
+
+
+#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
+  (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(),       \
+        U, A, B, I)
+
+
+#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
+  (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A),          \
+                                                  (__v32qi)(__m256i)(B),          \
+                                                  (char)(I))
+
+#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
+   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
+        (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I),                       \
+        (__v32qi)(__m256i)(S))
+
+#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
+  (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
+        U, A, B, I)
+
+
+#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
+  (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A),          \
+                                                  (__v64qi)(__m512i)(B),          \
+                                                  (char)(I))
+
+#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
+   (__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                            \
+        (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I),                       \
+        (__v64qi)(__m512i)(S))
+
+#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
+  (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(),    \
+        U, A, B, I)
+
+#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
+  (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A),             \
+                                                  (__v16qi)(__m128i)(B),          \
+                                                  (char)(I))
+
+#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
+        (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I),                             \
+        (__v16qi)(__m128i)(S))
+
+
+#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
+  (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(),          \
+        U, A, B, I)
+
+
+#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
+  (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A),             \
+                                                  (__v32qi)(__m256i)(B),          \
+                                                  (char)(I))
+
+#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
+   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
+        (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I),                          \
+        (__v32qi)(__m256i)(S))
+
+#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
+  (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(),    \
+        U, A, B, I)
+
+
+#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
+  (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A),             \
+                                                  (__v64qi)(__m512i)(B),          \
+                                                  (char)(I))
+
+#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
+   (__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                            \
+        (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I),                          \
+        (__v64qi)(__m512i)(S))
+
+#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
+  (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(),       \
+        U, A, B, I)
+
+/* Default attributes for simple form (no masking). */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
+
+/* Default attributes for YMM unmasked form. */
+#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
+
+/* Default attributes for ZMM forms. */
+#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
+
+/* Default attributes for VLX forms. */
+#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
+              (__v16qi) __B);
+}
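+
+(Illustrative GF(2^8) multiply in the field x^8 + x^4 + x^3 + x + 1, using the
+worked example from FIPS-197: {57} * {83} = {c1}.)
+
+    __m128i a = _mm_set1_epi8((char)0x57);
+    __m128i b = _mm_set1_epi8((char)0x83);
+    __m128i p = _mm_gf2p8mul_epi8(a, b);  /* every byte of p is 0xC1 */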
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
+_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectb_128(__U,
+              (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
+              (__v16qi) __S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
+_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
+              __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
+_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
+              (__v32qi) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
+_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectb_256(__U,
+              (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
+              (__v32qi) __S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
+_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
+              __U, __A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
+              (__v64qi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_selectb_512(__U,
+              (__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
+              (__v64qi) __S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
+              __U, __A, __B);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_Y
+#undef __DEFAULT_FN_ATTRS_Z
+#undef __DEFAULT_FN_ATTRS_VL128
+#undef __DEFAULT_FN_ATTRS_VL256
+
+#endif /* __GFNIINTRIN_H */
+
diff --git a/darwin-x86/clang-headers/htmxlintrin.h b/darwin-x86/clang-headers/htmxlintrin.h
index 16dc705..049dbd6 100644
--- a/darwin-x86/clang-headers/htmxlintrin.h
+++ b/darwin-x86/clang-headers/htmxlintrin.h
@@ -35,14 +35,10 @@
 extern "C" {
 #endif
 
-#define _TEXASR_PTR(TM_BUF) \
-  ((texasr_t *)((TM_BUF)+0))
-#define _TEXASRU_PTR(TM_BUF) \
-  ((texasru_t *)((TM_BUF)+0))
-#define _TEXASRL_PTR(TM_BUF) \
-  ((texasrl_t *)((TM_BUF)+4))
-#define _TFIAR_PTR(TM_BUF) \
-  ((tfiar_t *)((TM_BUF)+8))
+#define _TEXASR_PTR(TM_BUF) ((texasr_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRU_PTR(TM_BUF) ((texasru_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRL_PTR(TM_BUF) ((texasrl_t *)((char *)(TM_BUF) + 4))
+#define _TFIAR_PTR(TM_BUF) ((tfiar_t *)((char *)(TM_BUF) + 8))
 
 typedef char TM_buff_type[16];
 
@@ -178,7 +174,7 @@
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 __TM_is_conflict(void* const __TM_buff)
 {
-  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
   /* Return TEXASR bits 11 (Self-Induced Conflict) through
      14 (Translation Invalidation Conflict).  */
   return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
@@ -218,7 +214,7 @@
 
 /* These intrinsics are being made available for compatibility with
    the IBM XL compiler.  For documentation please see the "z/OS XL
-   C/C++ Programming Guide" publically available on the web.  */
+   C/C++ Programming Guide" publicly available on the web.  */
 
 static __inline long __attribute__((__always_inline__, __nodebug__))
 __TM_simple_begin ()
diff --git a/darwin-x86/clang-headers/ia32intrin.h b/darwin-x86/clang-headers/ia32intrin.h
index 397f3fd..f8972e3 100644
--- a/darwin-x86/clang-headers/ia32intrin.h
+++ b/darwin-x86/clang-headers/ia32intrin.h
@@ -60,12 +60,6 @@
   return __builtin_ia32_rdpmc(__A);
 }
 
-/* __rdtsc */
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
-__rdtsc(void) {
-  return __builtin_ia32_rdtsc();
-}
-
 /* __rdtscp */
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __rdtscp(unsigned int *__A) {
@@ -76,4 +70,9 @@
 
 #define _rdpmc(A) __rdpmc(A)
 
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_wbinvd(void) {
+  __builtin_ia32_wbinvd();
+}
+
 #endif /* __IA32INTRIN_H */
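As a usage note for this header's remaining timestamp intrinsic (a hedged sketch; the workload call is hypothetical and __rdtscp requires RDTSCP-capable hardware):

    #include <x86intrin.h>

    unsigned long long cycles_for_work(void) {
      unsigned int aux;                        /* receives IA32_TSC_AUX */
      unsigned long long start = __rdtscp(&aux);
      do_work();                               /* hypothetical workload */
      return __rdtscp(&aux) - start;
    }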
diff --git a/darwin-x86/clang-headers/immintrin.h b/darwin-x86/clang-headers/immintrin.h
index 4b27523..e7bfbf9 100644
--- a/darwin-x86/clang-headers/immintrin.h
+++ b/darwin-x86/clang-headers/immintrin.h
@@ -58,26 +58,25 @@
 #include <clflushoptintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLWB__)
+#include <clwbintrin.h>
+#endif
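The three-part guard repeated throughout this header is worth spelling out once: sub-headers are always visible to non-MSVC-compatible compiles and to modules builds (where inclusion must not depend on -m flags), but in MSVC mode they are exposed only when the target enables the matching feature. A reduced sketch with a hypothetical feature FOO:

    #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FOO__)
    #include <foointrin.h>   /* __FOO__ would be defined by -mfoo */
    #endif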
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
 #include <avxintrin.h>
 #endif
 
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
 #include <avx2intrin.h>
+#endif
 
-/* The 256-bit versions of functions in f16cintrin.h.
-   Intel documents these as being in immintrin.h, and
-   they depend on typedefs from avxintrin.h. */
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
+#include <f16cintrin.h>
+#endif
 
-#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
-
-static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
-_mm256_cvtph_ps(__m128i __a)
-{
-  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
-}
-#endif /* __AVX2__ */
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
+#include <vpclmulqdqintrin.h>
+#endif
 
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
 #include <bmiintrin.h>
@@ -91,6 +90,10 @@
 #include <lzcntintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
+#include <popcntintrin.h>
+#endif
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
 #include <fmaintrin.h>
 #endif
@@ -107,15 +110,42 @@
 #include <avx512bwintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__)
+#include <avx512bitalgintrin.h>
+#endif
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
 #include <avx512cdintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
+#include <avx512vpopcntdqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
+#include <avx512vpopcntdqvlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__)
+#include <avx512vnniintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512VNNI__))
+#include <avx512vlvnniintrin.h>
+#endif
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
 #endif
 
 #if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512BITALG__))
+#include <avx512vlbitalgintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
     (defined(__AVX512VL__) && defined(__AVX512BW__))
 #include <avx512vlbwintrin.h>
 #endif
@@ -152,6 +182,15 @@
 #include <avx512vbmivlintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__)
+#include <avx512vbmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
+#include <avx512vlvbmi2intrin.h>
+#endif
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
 #include <avx512pfintrin.h>
 #endif
@@ -160,6 +199,26 @@
 #include <pkuintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__)
+#include <vaesintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__)
+#include <gfniintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDPID__)
+/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDPID </c> instruction.
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
+_rdpid_u32(void) {
+  return __builtin_ia32_rdpid();
+}
+#endif // __RDPID__
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand16_step(unsigned short *__p)
@@ -173,6 +232,15 @@
   return __builtin_ia32_rdrand32_step(__p);
 }
 
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
 /* __bit_scan_forward */
 static __inline__ int __attribute__((__always_inline__, __nodebug__))
 _bit_scan_forward(int __A) {
@@ -185,15 +253,6 @@
   return 31 - __builtin_clz(__A);
 }
 
-#ifdef __x86_64__
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand64_step(unsigned long long *__p)
-{
-  return __builtin_ia32_rdrand64_step(__p);
-}
-#endif
-#endif /* __RDRND__ */
-
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
 #ifdef __x86_64__
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
@@ -223,25 +282,25 @@
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u32(unsigned int __V)
 {
-  return __builtin_ia32_wrfsbase32(__V);
+  __builtin_ia32_wrfsbase32(__V);
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u64(unsigned long long __V)
 {
-  return __builtin_ia32_wrfsbase64(__V);
+  __builtin_ia32_wrfsbase64(__V);
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u32(unsigned int __V)
 {
-  return __builtin_ia32_wrgsbase32(__V);
+  __builtin_ia32_wrgsbase32(__V);
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u64(unsigned long long __V)
 {
-  return __builtin_ia32_wrgsbase64(__V);
+  __builtin_ia32_wrgsbase64(__V);
 }
 
 #endif
@@ -276,8 +335,133 @@
 #include <xsavesintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__)
+#include <cetintrin.h>
+#endif
+
 /* Some intrinsics inside adxintrin.h are available only on processors with ADX,
  * whereas others are also available at all times. */
 #include <adxintrin.h>
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
+#include <rdseedintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WBNOINVD__)
+#include <wbnoinvdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLDEMOTE__)
+#include <cldemoteintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WAITPKG__)
+#include <waitpkgintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+  defined(__MOVDIRI__) || defined(__MOVDIR64B__)
+#include <movdirintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PCONFIG__)
+#include <pconfigintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SGX__)
+#include <sgxintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PTWRITE__)
+#include <ptwriteintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__INVPCID__)
+#include <invpcidintrin.h>
+#endif
+
+#ifdef _MSC_VER
+/* Define the default attributes for these intrinsics */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange HLE
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
+  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
+                       : "+r" (_Value), "+m" (*_Target) :: "memory");
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
+  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
+                       : "+r" (_Value), "+m" (*_Target) :: "memory");
+  return _Value;
+}
+#endif
+#if defined(__x86_64__)
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
+  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
+                       : "+r" (_Value), "+m" (*_Target) :: "memory");
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
+  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
+                       : "+r" (_Value), "+m" (*_Target) :: "memory");
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange HLE
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
+                       : "+a" (_Comparand), "+m" (*_Destination)
+                       : "r" (_Exchange) : "memory");
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
+                       : "+a" (_Comparand), "+m" (*_Destination)
+                       : "r" (_Exchange) : "memory");
+  return _Comparand;
+}
+#endif
+#if defined(__x86_64__)
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
+                       : "+a" (_Comparand), "+m" (*_Destination)
+                       : "r" (_Exchange) : "memory");
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
+                       : "+a" (_Comparand), "+m" (*_Destination)
+                       : "r" (_Exchange) : "memory");
+  return _Comparand;
+}
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _MSC_VER */
+
 #endif /* __IMMINTRIN_H */
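Since the RDRND steps relocated above can transiently fail (returning 0 when the hardware has no entropy ready), callers conventionally retry. A minimal sketch for the 64-bit variant (x86-64 with -mrdrnd; the retry bound is arbitrary):

    #include <immintrin.h>

    int hw_random_u64(unsigned long long *out) {
      for (int i = 0; i < 10; ++i)
        if (_rdrand64_step(out))
          return 1;               /* success: *out holds a random value */
      return 0;                   /* hardware kept failing; fall back */
    }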
diff --git a/darwin-x86/clang-headers/intrin.h b/darwin-x86/clang-headers/intrin.h
index f18711a..9191421 100644
--- a/darwin-x86/clang-headers/intrin.h
+++ b/darwin-x86/clang-headers/intrin.h
@@ -34,6 +34,14 @@
 #include <x86intrin.h>
 #endif
 
+#if defined(__arm__)
+#include <armintr.h>
+#endif
+
+#if defined(__aarch64__)
+#include <arm64intr.h>
+#endif
+
 /* For the definition of jmp_buf. */
 #if __STDC_HOSTED__
 #include <setjmp.h>
@@ -61,10 +69,10 @@
 void __cpuid(int[4], int);
 static __inline__
 void __cpuidex(int[4], int, int);
-void __debugbreak(void);
+static __inline__
 __int64 __emul(int, int);
+static __inline__
 unsigned __int64 __emulu(unsigned int, unsigned int);
-void __cdecl __fastfail(unsigned int);
 unsigned int __getcallerseflags(void);
 static __inline__
 void __halt(void);
@@ -82,9 +90,6 @@
 void __lidt(void *);
 unsigned __int64 __ll_lshift(unsigned __int64, int);
 __int64 __ll_rshift(__int64, int);
-void __llwpcb(void *);
-unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
-void __lwpval32(unsigned int, unsigned int, unsigned int);
 unsigned int __lzcnt(unsigned int);
 unsigned short __lzcnt16(unsigned short);
 static __inline__
@@ -93,6 +98,7 @@
 void __movsd(unsigned long *, unsigned long const *, size_t);
 static __inline__
 void __movsw(unsigned short *, unsigned short const *, size_t);
+static __inline__
 void __nop(void);
 void __nvreg_restore_fence(void);
 void __nvreg_save_fence(void);
@@ -102,10 +108,6 @@
 void __outdwordstring(unsigned short, unsigned long *, unsigned long);
 void __outword(unsigned short, unsigned short);
 void __outwordstring(unsigned short, unsigned short *, unsigned long);
-static __inline__
-unsigned int __popcnt(unsigned int);
-static __inline__
-unsigned short __popcnt16(unsigned short);
 unsigned long __readcr0(void);
 unsigned long __readcr2(void);
 static __inline__
@@ -117,8 +119,6 @@
 static __inline__
 unsigned char __readfsbyte(unsigned long);
 static __inline__
-unsigned long __readfsdword(unsigned long);
-static __inline__
 unsigned __int64 __readfsqword(unsigned long);
 static __inline__
 unsigned short __readfsword(unsigned long);
@@ -128,7 +128,6 @@
 unsigned __int64 __readpmc(unsigned long);
 unsigned long __segmentlimit(unsigned long);
 void __sidt(void *);
-void *__slwpcb(void);
 static __inline__
 void __stosb(unsigned char *, unsigned char, size_t);
 static __inline__
@@ -164,114 +163,31 @@
 unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
 static __inline__
 unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
-static __inline__
 unsigned char _bittest(long const *, long);
-static __inline__
 unsigned char _bittestandcomplement(long *, long);
-static __inline__
 unsigned char _bittestandreset(long *, long);
-static __inline__
 unsigned char _bittestandset(long *, long);
-unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
-unsigned long __cdecl _byteswap_ulong(unsigned long);
-unsigned short __cdecl _byteswap_ushort(unsigned short);
 void __cdecl _disable(void);
 void __cdecl _enable(void);
 long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
-static __inline__
-long _InterlockedAnd(long volatile *_Value, long _Mask);
-static __inline__
-short _InterlockedAnd16(short volatile *_Value, short _Mask);
-static __inline__
-char _InterlockedAnd8(char volatile *_Value, char _Mask);
 unsigned char _interlockedbittestandreset(long volatile *, long);
-static __inline__
 unsigned char _interlockedbittestandset(long volatile *, long);
-static __inline__
-long __cdecl _InterlockedCompareExchange(long volatile *_Destination,
-                                         long _Exchange, long _Comparand);
-long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
-long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
-static __inline__
-short _InterlockedCompareExchange16(short volatile *_Destination,
-                                    short _Exchange, short _Comparand);
-static __inline__
-__int64 _InterlockedCompareExchange64(__int64 volatile *_Destination,
-                                      __int64 _Exchange, __int64 _Comparand);
-__int64 _InterlockedcompareExchange64_HLEAcquire(__int64 volatile *, __int64,
-                                                 __int64);
-__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
-                                                 __int64);
-static __inline__
-char _InterlockedCompareExchange8(char volatile *_Destination, char _Exchange,
-                                  char _Comparand);
 void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
                                                     void *);
 void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
                                                     void *);
-static __inline__
-long __cdecl _InterlockedDecrement(long volatile *_Addend);
-static __inline__
-short _InterlockedDecrement16(short volatile *_Addend);
-long _InterlockedExchange(long volatile *_Target, long _Value);
-static __inline__
-short _InterlockedExchange16(short volatile *_Target, short _Value);
-static __inline__
-char _InterlockedExchange8(char volatile *_Target, char _Value);
-static __inline__
-long __cdecl _InterlockedExchangeAdd(long volatile *_Addend, long _Value);
 long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
 long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
-static __inline__
-short _InterlockedExchangeAdd16(short volatile *_Addend, short _Value);
 __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
 __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
-static __inline__
-char _InterlockedExchangeAdd8(char volatile *_Addend, char _Value);
-static __inline__
-long __cdecl _InterlockedIncrement(long volatile *_Addend);
-static __inline__
-short _InterlockedIncrement16(short volatile *_Addend);
-static __inline__
-long _InterlockedOr(long volatile *_Value, long _Mask);
-static __inline__
-short _InterlockedOr16(short volatile *_Value, short _Mask);
-static __inline__
-char _InterlockedOr8(char volatile *_Value, char _Mask);
-static __inline__
-long _InterlockedXor(long volatile *_Value, long _Mask);
-static __inline__
-short _InterlockedXor16(short volatile *_Value, short _Mask);
-static __inline__
-char _InterlockedXor8(char volatile *_Value, char _Mask);
 void __cdecl _invpcid(unsigned int, void *);
-static __inline__
-unsigned long __cdecl _lrotl(unsigned long, int);
-static __inline__
-unsigned long __cdecl _lrotr(unsigned long, int);
-static __inline__
-void _ReadBarrier(void);
-static __inline__
-void _ReadWriteBarrier(void);
-static __inline__
-void *_ReturnAddress(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void);
 unsigned int _rorx_u32(unsigned int, const unsigned int);
-static __inline__
-unsigned int __cdecl _rotl(unsigned int _Value, int _Shift);
-static __inline__
-unsigned short _rotl16(unsigned short _Value, unsigned char _Shift);
-static __inline__
-unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift);
-static __inline__
-unsigned char _rotl8(unsigned char _Value, unsigned char _Shift);
-static __inline__
-unsigned int __cdecl _rotr(unsigned int _Value, int _Shift);
-static __inline__
-unsigned short _rotr16(unsigned short _Value, unsigned char _Shift);
-static __inline__
-unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift);
-static __inline__
-unsigned char _rotr8(unsigned char _Value, unsigned char _Shift);
 int _sarx_i32(int, unsigned int);
 #if __STDC_HOSTED__
 int __cdecl _setjmp(jmp_buf);
@@ -281,8 +197,9 @@
 void _Store_HLERelease(long volatile *, long);
 void _Store64_HLERelease(__int64 volatile *, __int64);
 void _StorePointer_HLERelease(void *volatile *, void *);
-static __inline__
-void _WriteBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void);
 unsigned __int32 xbegin(void);
 void _xend(void);
 static __inline__
@@ -302,14 +219,9 @@
 void __incgsdword(unsigned long);
 void __incgsqword(unsigned long);
 void __incgsword(unsigned long);
-unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
-void __lwpval64(unsigned __int64, unsigned int, unsigned int);
 unsigned __int64 __lzcnt64(unsigned __int64);
 static __inline__
 void __movsq(unsigned long long *, unsigned long long const *, size_t);
-__int64 __mulh(__int64, __int64);
-static __inline__
-unsigned __int64 __popcnt64(unsigned __int64);
 static __inline__
 unsigned char __readgsbyte(unsigned long);
 static __inline__
@@ -336,25 +248,15 @@
 void __writegsdword(unsigned long, unsigned long);
 void __writegsqword(unsigned long, unsigned __int64);
 void __writegsword(unsigned long, unsigned short);
-static __inline__
-unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
-static __inline__
-unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
-static __inline__
 unsigned char _bittest64(__int64 const *, __int64);
-static __inline__
 unsigned char _bittestandcomplement64(__int64 *, __int64);
-static __inline__
 unsigned char _bittestandreset64(__int64 *, __int64);
-static __inline__
 unsigned char _bittestandset64(__int64 *, __int64);
-unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
 long _InterlockedAnd_np(long volatile *_Value, long _Mask);
 short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
 __int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
 char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
 unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
-static __inline__
 unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
 long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
                                     long _Comparand);
@@ -368,531 +270,573 @@
                                                 __int64 *_ComparandResult);
 short _InterlockedCompareExchange16_np(short volatile *_Destination,
                                        short _Exchange, short _Comparand);
-__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
-                                                 __int64);
-__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
-                                                 __int64);
 __int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
                                          __int64 _Exchange, __int64 _Comparand);
-void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
-                                         void *_Exchange, void *_Comparand);
 void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
                                             void *_Exchange, void *_Comparand);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+static __inline__
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
+static __inline__
+__int64 _mul128(__int64, __int64, __int64*);
+static __inline__
+unsigned __int64 _umul128(unsigned __int64,
+                          unsigned __int64,
+                          unsigned __int64*);
+
+#endif /* __x86_64__ */
+
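The multiply helpers declared in the x86-64 block above split the full 128-bit product: _umul128 returns the low 64 bits and stores the high 64 through its out-parameter, while __umulh returns only the high half. A sketch of that contract (MSVC-compatibility mode; the function name is illustrative):

    #include <intrin.h>

    unsigned __int64 product_high(unsigned __int64 a, unsigned __int64 b) {
      unsigned __int64 hi;
      (void)_umul128(a, b, &hi);   /* low half discarded here */
      return hi;                   /* same value __umulh(a, b) returns */
    }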
+#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
+
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+
 static __inline__
 __int64 _InterlockedDecrement64(__int64 volatile *_Addend);
 static __inline__
 __int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
 static __inline__
 __int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
-void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
+static __inline__
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
 static __inline__
 __int64 _InterlockedIncrement64(__int64 volatile *_Addend);
-long _InterlockedOr_np(long volatile *_Value, long _Mask);
-short _InterlockedOr16_np(short volatile *_Value, short _Mask);
 static __inline__
 __int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
-char _InterlockedOr8_np(char volatile *_Value, char _Mask);
-long _InterlockedXor_np(long volatile *_Value, long _Mask);
-short _InterlockedXor16_np(short volatile *_Value, short _Mask);
 static __inline__
 __int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
-char _InterlockedXor8_np(char volatile *_Value, char _Mask);
 static __inline__
-__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
-                __int64 *_HighProduct);
-unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
-__int64 _sarx_i64(__int64, unsigned int);
-#if __STDC_HOSTED__
-int __cdecl _setjmpex(jmp_buf);
-#endif
-unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
-unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
-/*
- * Multiply two 64-bit integers and obtain a 64-bit result.
- * The low-half is returned directly and the high half is in an out parameter.
- */
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand,
-         unsigned __int64 *_HighProduct) {
-  unsigned __int128 _FullProduct =
-      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
-  *_HighProduct = _FullProduct >> 64;
-  return _FullProduct;
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) {
-  unsigned __int128 _FullProduct =
-      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
-  return _FullProduct >> 64;
-}
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
 
-#endif /* __x86_64__ */
-
-/*----------------------------------------------------------------------------*\
-|* Multiplication
-\*----------------------------------------------------------------------------*/
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-__emul(int __in1, int __in2) {
-  return (__int64)__in1 * (__int64)__in2;
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__emulu(unsigned int __in1, unsigned int __in2) {
-  return (unsigned __int64)__in1 * (unsigned __int64)__in2;
-}
-/*----------------------------------------------------------------------------*\
-|* Bit Twiddling
-\*----------------------------------------------------------------------------*/
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_rotl8(unsigned char _Value, unsigned char _Shift) {
-  _Shift &= 0x7;
-  return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_rotr8(unsigned char _Value, unsigned char _Shift) {
-  _Shift &= 0x7;
-  return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-_rotl16(unsigned short _Value, unsigned char _Shift) {
-  _Shift &= 0xf;
-  return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-_rotr16(unsigned short _Value, unsigned char _Shift) {
-  _Shift &= 0xf;
-  return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rotl(unsigned int _Value, int _Shift) {
-  _Shift &= 0x1f;
-  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rotr(unsigned int _Value, int _Shift) {
-  _Shift &= 0x1f;
-  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-_lrotl(unsigned long _Value, int _Shift) {
-  _Shift &= 0x1f;
-  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-_lrotr(unsigned long _Value, int _Shift) {
-  _Shift &= 0x1f;
-  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
-}
-static
-__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_rotl64(unsigned __int64 _Value, int _Shift) {
-  _Shift &= 0x3f;
-  return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
-}
-static
-__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_rotr64(unsigned __int64 _Value, int _Shift) {
-  _Shift &= 0x3f;
-  return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
-}
-/*----------------------------------------------------------------------------*\
-|* Bit Counting and Testing
-\*----------------------------------------------------------------------------*/
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
-  if (!_Mask)
-    return 0;
-  *_Index = __builtin_ctzl(_Mask);
-  return 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
-  if (!_Mask)
-    return 0;
-  *_Index = 31 - __builtin_clzl(_Mask);
-  return 1;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__popcnt16(unsigned short _Value) {
-  return __builtin_popcount((int)_Value);
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__popcnt(unsigned int _Value) {
-  return __builtin_popcount(_Value);
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittest(long const *_BitBase, long _BitPos) {
-  return (*_BitBase >> _BitPos) & 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandcomplement(long *_BitBase, long _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase ^ (1 << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandreset(long *_BitBase, long _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase & ~(1 << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandset(long *_BitBase, long _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase | (1 << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
-  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
-  return (_PrevVal >> _BitPos) & 1;
-}
-#ifdef __x86_64__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
-  if (!_Mask)
-    return 0;
-  *_Index = __builtin_ctzll(_Mask);
-  return 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
-  if (!_Mask)
-    return 0;
-  *_Index = 63 - __builtin_clzll(_Mask);
-  return 1;
-}
-static __inline__
-unsigned __int64 __DEFAULT_FN_ATTRS
-__popcnt64(unsigned __int64 _Value) {
-  return __builtin_popcountll(_Value);
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
-  return (*_BitBase >> _BitPos) & 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase ^ (1ll << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase & ~(1ll << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase | (1ll << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
-  long long _PrevVal =
-      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
-  return (_PrevVal >> _BitPos) & 1;
-}
 #endif
+
 /*----------------------------------------------------------------------------*\
 |* Interlocked Exchange Add
 \*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
 static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
-  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
 }
 static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
-  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
-  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange Sub
-\*----------------------------------------------------------------------------*/
-static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
-  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
 }
 static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
-  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
 }
 static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
-  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
 }
-#ifdef __x86_64__
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
 static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
-  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
 }
 #endif
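For readers new to the suffix scheme used in this block: _acq, _nf ("no fence"), and _rel map onto the C11 acquire, relaxed, and release orderings, and each variant returns the value the addend held before the addition. An equivalent spelling in portable C11 atomics (illustrative):

    #include <stdatomic.h>

    long fetch_add_release(_Atomic long *p, long v) {
      /* corresponds to _InterlockedExchangeAdd_rel */
      return atomic_fetch_add_explicit(p, v, memory_order_release);
    }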
 /*----------------------------------------------------------------------------*\
 |* Interlocked Increment
 \*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
 static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedIncrement16(short volatile *_Value) {
-  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedIncrement16_acq(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
 }
-#ifdef __x86_64__
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_nf(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_rel(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_acq(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_nf(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_rel(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
 static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedIncrement64(__int64 volatile *_Value) {
-  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedIncrement64_acq(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_nf(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_rel(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Decrement
 \*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
 static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedDecrement16(short volatile *_Value) {
-  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedDecrement16_acq(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
 }
-#ifdef __x86_64__
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_nf(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_rel(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_acq(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_nf(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_rel(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
 static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedDecrement64(__int64 volatile *_Value) {
-  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedDecrement64_acq(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_nf(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_rel(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked And
 \*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
 static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedAnd8(char volatile *_Value, char _Mask) {
-  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
 }
 static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedAnd16(short volatile *_Value, short _Mask) {
-  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
 }
 static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedAnd(long volatile *_Value, long _Mask) {
-  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
 }
-#ifdef __x86_64__
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
 static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
-  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
 }
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+unsigned char _interlockedbittestandset_acq(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandset_nf(long volatile *_BitBase,
+                                           long _BitPos);
+unsigned char _interlockedbittestandset_rel(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase,
+                                              long _BitPos);
+unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase,
+                                             long _BitPos);
+unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase,
+                                              long _BitPos);
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Or
 \*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
 static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedOr8(char volatile *_Value, char _Mask) {
-  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
 }
 static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedOr16(short volatile *_Value, short _Mask) {
-  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
 }
 static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedOr(long volatile *_Value, long _Mask) {
-  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
 }
-#ifdef __x86_64__
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
 static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
-  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Xor
 \*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
 static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedXor8(char volatile *_Value, char _Mask) {
-  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
 }
 static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedXor16(short volatile *_Value, short _Mask) {
-  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
 }
 static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedXor(long volatile *_Value, long _Mask) {
-  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
 }
-#ifdef __x86_64__
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
 static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
-  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Exchange
 \*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
 static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchange8(char volatile *_Target, char _Value) {
-  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+_InterlockedExchange8_acq(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_nf(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_rel(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
   return _Value;
 }
 static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchange16(short volatile *_Target, short _Value) {
-  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+_InterlockedExchange16_acq(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
   return _Value;
 }
-#ifdef __x86_64__
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_nf(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_rel(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_acq(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_nf(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_rel(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
 static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
-  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+_InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
   return _Value;
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Compare Exchange
 \*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
 static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange8(char volatile *_Destination,
+_InterlockedCompareExchange8_acq(char volatile *_Destination,
                              char _Exchange, char _Comparand) {
   __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
-                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_nf(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_rel(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
   return _Comparand;
 }
 static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange16(short volatile *_Destination,
+_InterlockedCompareExchange16_acq(short volatile *_Destination,
                               short _Exchange, short _Comparand) {
   __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
-                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_nf(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_rel(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_acq(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_nf(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_rel(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
   return _Comparand;
 }
 static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange64(__int64 volatile *_Destination,
+_InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
                               __int64 _Exchange, __int64 _Comparand) {
   __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
-                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
   return _Comparand;
 }
-/*----------------------------------------------------------------------------*\
-|* Barriers
-\*----------------------------------------------------------------------------*/
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadWriteBarrier(void) {
-  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadBarrier(void) {
-  __atomic_signal_fence(__ATOMIC_SEQ_CST);
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_WriteBarrier(void) {
-  __atomic_signal_fence(__ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-__faststorefence(void) {
-  __atomic_thread_fence(__ATOMIC_SEQ_CST);
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
 }
 #endif
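
The _acq/_rel/_nf variants above differ only in the failure memory order handed
to __atomic_compare_exchange; the success order stays __ATOMIC_SEQ_CST. A
minimal sketch of the same pattern in portable C11 atomics (my_cas_acq is a
hypothetical helper, not part of intrin.h):

    #include <stdatomic.h>

    /* Sketch: mirrors _InterlockedCompareExchange_acq. On failure the
     * compare-exchange writes the observed value back into 'comparand',
     * so returning it matches the MSVC contract of returning the value
     * originally read from *dst. */
    static long my_cas_acq(volatile atomic_long *dst, long exchange,
                           long comparand) {
      atomic_compare_exchange_strong_explicit(dst, &comparand, exchange,
                                              memory_order_seq_cst,
                                              memory_order_acquire);
      return comparand;
    }
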
-/*----------------------------------------------------------------------------*\
-|* readfs, readgs
-|* (Pointers in address space #256 and #257 are relative to the GS and FS
-|* segment registers, respectively.)
-\*----------------------------------------------------------------------------*/
-#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
-    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
-    (__offset))
 
-#ifdef __i386__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-__readfsbyte(unsigned long __offset) {
-  return *__ptr_to_addr_space(257, unsigned char, __offset);
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__readfsword(unsigned long __offset) {
-  return *__ptr_to_addr_space(257, unsigned short, __offset);
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__readfsqword(unsigned long __offset) {
-  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
-}
-#endif
-#ifdef __x86_64__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-__readgsbyte(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned char, __offset);
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__readgsword(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned short, __offset);
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-__readgsdword(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned long, __offset);
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__readgsqword(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
-}
-#endif
-#undef __ptr_to_addr_space
 /*----------------------------------------------------------------------------*\
 |* movs, stos
 \*----------------------------------------------------------------------------*/
 #if defined(__i386__) || defined(__x86_64__)
 static __inline__ void __DEFAULT_FN_ATTRS
 __movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
-  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
-                        : "%edi", "%esi", "%ecx");
+  __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       : : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
-  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
-                        : "%edi", "%esi", "%ecx");
+  __asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       : : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
-  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)
-                        : "%edi", "%esi", "%ecx");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
-  __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
-                        : "%edi", "%ecx");
+  __asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       : : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
-  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
-                        : "%edi", "%ecx");
+  __asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x)
+                       : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
-  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)
-                        : "%edi", "%ecx");
+  __asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x)
+                       : "memory");
 }
 #endif
 #ifdef __x86_64__
 static __inline__ void __DEFAULT_FN_ATTRS
 __movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
-  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
-                        : "%edi", "%esi", "%ecx");
+  __asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       : : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
-  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
-                        : "%edi", "%ecx");
+  __asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x)
+                       : "memory");
 }
 #endif
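
The rep-string rewrites above fix two problems with the old asm: listing
%edi/%esi/%ecx both as inputs and as clobbers is invalid (an asm statement may
not clobber its own inputs), and without __volatile__ plus a "memory" clobber
the compiler was free to cache memory across the copy, reorder it, or delete
it outright. A sketch of the corrected constraint style, for x86 targets only,
with my_movsb as a hypothetical stand-in for __movsb:

    #include <stddef.h>

    /* Sketch: "+D", "+S" and "+c" make the pointer and count registers
     * read-write operands, and the "memory" clobber tells the compiler
     * the asm both reads and writes memory it cannot see. */
    static void my_movsb(unsigned char *dst, const unsigned char *src,
                         size_t n) {
      __asm__ __volatile__("rep movsb"
                           : "+D"(dst), "+S"(src), "+c"(n)
                           : : "memory");
    }
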
 
 /*----------------------------------------------------------------------------*\
 |* Misc
 \*----------------------------------------------------------------------------*/
-static __inline__ void * __DEFAULT_FN_ATTRS
-_AddressOfReturnAddress(void) {
-  return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
-}
-static __inline__ void * __DEFAULT_FN_ATTRS
-_ReturnAddress(void) {
-  return __builtin_return_address(0);
-}
 #if defined(__i386__) || defined(__x86_64__)
 static __inline__ void __DEFAULT_FN_ATTRS
 __cpuid(int __info[4], int __level) {
@@ -914,6 +858,24 @@
 __halt(void) {
   __asm__ volatile ("hlt");
 }
+static __inline__ void __DEFAULT_FN_ATTRS
+__nop(void) {
+  __asm__ volatile ("nop");
+}
+#endif
+#if defined(__x86_64__)
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__shiftleft128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) {
+  unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l;
+  unsigned __int128 __res = __val << (__d & 63);
+  return (unsigned __int64)(__res >> 64);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__shiftright128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) {
+  unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l;
+  unsigned __int128 __res = __val >> (__d & 63);
+  return (unsigned __int64)__res;
+}
 #endif
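
__shiftleft128 and __shiftright128 widen the two 64-bit halves into an
unsigned __int128, shift by the count masked to 63 (matching SHLD/SHRD, which
use only the low 6 bits of the count), and return the half that receives the
shifted-in bits. A quick check of the left-shift arithmetic (assumes a
compiler with unsigned __int128, as the implementation itself does):

    #include <assert.h>

    static void shift128_check(void) {
      unsigned __int128 v = (unsigned __int128)~0ULL; /* hi = 0, lo = all ones */
      /* Left shift by 4: the top 4 bits of the low half move into the
       * high half, so the returned high word is 0xF. A count of 64 is
       * masked to 0, so the inputs pass through unshifted. */
      assert((unsigned long long)((v << 4) >> 64) == 0xFULL);
    }
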
 
 /*----------------------------------------------------------------------------*\
diff --git a/darwin-x86/clang-headers/invpcidintrin.h b/darwin-x86/clang-headers/invpcidintrin.h
new file mode 100644
index 0000000..c30a19f
--- /dev/null
+++ b/darwin-x86/clang-headers/invpcidintrin.h
@@ -0,0 +1,37 @@
+/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <invpcidintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __INVPCIDINTRIN_H
+#define __INVPCIDINTRIN_H
+
+static __inline__ void
+  __attribute__((__always_inline__, __nodebug__, __target__("invpcid")))
+_invpcid(unsigned int __type, void *__descriptor) {
+  __builtin_ia32_invpcid(__type, __descriptor);
+}
+
+#endif /* __INVPCIDINTRIN_H */
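
For context, _invpcid wraps the privileged INVPCID instruction, so it is only
usable at CPL 0; __type selects the invalidation scope (0 = individual
address, 1 = single PCID, 2 = all contexts including globals, 3 = all contexts
excluding globals) and __descriptor points at a 16-byte descriptor. A hedged
sketch of kernel-style usage (flush_all_tlb is a hypothetical name; compile
with -minvpcid):

    #include <immintrin.h>
    #include <stdint.h>

    /* Sketch: type 2 invalidates all TLB entries, including global
     * pages; the descriptor contents are ignored for this type but the
     * pointer must still reference valid memory. */
    static void flush_all_tlb(void) {
      uint64_t desc[2] = {0, 0}; /* {PCID, linear address} layout */
      _invpcid(2, desc);
    }
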
diff --git a/darwin-x86/clang-headers/lwpintrin.h b/darwin-x86/clang-headers/lwpintrin.h
new file mode 100644
index 0000000..0b28d73
--- /dev/null
+++ b/darwin-x86/clang-headers/lwpintrin.h
@@ -0,0 +1,150 @@
+/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LWPINTRIN_H
+#define __LWPINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
+
+/// Parses the LWPCB at the specified address and enables
+///        profiling if valid.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
+///
+/// \param __addr
+///    Address of the new Lightweight Profiling Control Block (LWPCB). If the
+///    LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
+///    Lightweight Profiling.
+static __inline__ void __DEFAULT_FN_ATTRS
+__llwpcb (void *__addr)
+{
+  __builtin_ia32_llwpcb(__addr);
+}
+
+/// Flushes the LWP state to memory and returns the address of the LWPCB.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
+///
+/// \return
+///    Address of the current Lightweight Profiling Control Block (LWPCB).
+///    If LWP is not currently enabled, returns NULL.
+static __inline__ void* __DEFAULT_FN_ATTRS
+__slwpcb (void)
+{
+  return __builtin_ia32_slwpcb();
+}
+
+/// Inserts a programmed event record into the LWP event ring buffer
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+///    the event record overwrites the last record in the buffer, the MissedEvents
+///    counter in the LWPCB is incremented, the head pointer is not advanced, and
+///    1 is returned. Otherwise 0 is returned.
+#define __lwpins32(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+/// Decrements the LWP programmed value sample event counter. If the result is
+///        negative, inserts an event record into the LWP event ring buffer in memory
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval32(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+#ifdef __x86_64__
+
+/// Inserts a programmed event record into the LWP event ring buffer
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+///    A 64-bit value is inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+///    the event record overwrites the last record in the buffer, the MissedEvents
+///    counter in the LWPCB is incremented, the head pointer is not advanced, and
+///    1 is returned. Otherwise 0 is returned.
+#define __lwpins64(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+/// Decrements the LWP programmed value sample event counter. If the result is
+///        negative, inserts an event record into the LWP event ring buffer in memory
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+///    A 64-bit value is inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval64(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LWPINTRIN_H */
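
Putting the new LWP intrinsics together, the intended flow is: point the
hardware at a Lightweight Profiling Control Block with __llwpcb, insert
records with __lwpins*/__lwpval*, and read back the active block with
__slwpcb. A sketch under the assumption of an AMD CPU with LWP support and a
caller-prepared LWPCB (the block layout comes from AMD's LWP specification,
not from this header; compile with -mlwp):

    #include <x86intrin.h>

    static void lwp_demo(void *lwpcb) {
      __llwpcb(lwpcb);                     /* validate LWPCB, enable LWP */
      int full = __lwpins32(42u, 7u, 0u);  /* insert one event record;
                                            * nonzero means the ring buffer
                                            * was full (Synchronized Mode) */
      (void)full;
      void *active = __slwpcb();           /* flush state, fetch active LWPCB */
      (void)active;
    }
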
diff --git a/darwin-x86/clang-headers/lzcntintrin.h b/darwin-x86/clang-headers/lzcntintrin.h
index 4c00e42..558f182 100644
--- a/darwin-x86/clang-headers/lzcntintrin.h
+++ b/darwin-x86/clang-headers/lzcntintrin.h
@@ -31,18 +31,50 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
 
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of leading zero
+///    bits in the operand.
 static __inline__ unsigned short __DEFAULT_FN_ATTRS
 __lzcnt16(unsigned short __X)
 {
   return __X ? __builtin_clzs(__X) : 16;
 }
 
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+/// \see _lzcnt_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __lzcnt32(unsigned int __X)
 {
   return __X ? __builtin_clz(__X) : 32;
 }
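
The ternary guard matters because __builtin_clz and __builtin_clzs are
undefined for a zero input, whereas the LZCNT instruction is defined to return
the operand width; the fallback makes the two agree. A quick check (compile
with -mlzcnt):

    #include <x86intrin.h>
    #include <assert.h>

    static void lzcnt_check(void) {
      assert(__lzcnt32(0u) == 32);  /* zero input: full operand width */
      assert(__lzcnt32(1u) == 31);  /* 31 zero bits above bit 0       */
      assert(__lzcnt16((unsigned short)0x00FF) == 8);
    }
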
 
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+/// \see __lzcnt32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _lzcnt_u32(unsigned int __X)
 {
@@ -50,12 +82,34 @@
 }
 
 #ifdef __x86_64__
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+/// \see _lzcnt_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __lzcnt64(unsigned long long __X)
 {
   return __X ? __builtin_clzll(__X) : 64;
 }
 
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+/// \see __lzcnt64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _lzcnt_u64(unsigned long long __X)
 {
diff --git a/darwin-x86/clang-headers/mm3dnow.h b/darwin-x86/clang-headers/mm3dnow.h
index 294866c..b028875 100644
--- a/darwin-x86/clang-headers/mm3dnow.h
+++ b/darwin-x86/clang-headers/mm3dnow.h
@@ -30,9 +30,9 @@
 typedef float __v2sf __attribute__((__vector_size__(8)));
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64)))
 
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
 _m_femms(void) {
   __builtin_ia32_femms();
 }
@@ -134,7 +134,7 @@
 
 /* Handle the 3dnowa instructions here. */
 #undef __DEFAULT_FN_ATTRS
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64)))
 
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pf2iw(__m64 __m) {
diff --git a/darwin-x86/clang-headers/mmintrin.h b/darwin-x86/clang-headers/mmintrin.h
index cefd605..a735399 100644
--- a/darwin-x86/clang-headers/mmintrin.h
+++ b/darwin-x86/clang-headers/mmintrin.h
@@ -32,27 +32,27 @@
 typedef char __v8qi __attribute__((__vector_size__(8)));
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
 
-/// \brief Clears the MMX state by setting the state of the x87 stack registers
+/// Clears the MMX state by setting the state of the x87 stack registers
 ///    to empty.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c EMMS instruction.
+/// This intrinsic corresponds to the <c> EMMS </c> instruction.
 ///
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void  __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
 _mm_empty(void)
 {
     __builtin_ia32_emms();
 }
 
-/// \brief Constructs a 64-bit integer vector, setting the lower 32 bits to the
+/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
 ///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> MOVD </c> instruction.
 ///
 /// \param __i
 ///    A 32-bit integer value.
@@ -64,12 +64,12 @@
     return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
 }
 
-/// \brief Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
+/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
 ///    signed integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> MOVD </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector.
@@ -81,11 +81,11 @@
     return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
 }
 
-/// \brief Casts a 64-bit signed integer value into a 64-bit integer vector.
+/// Casts a 64-bit signed integer value into a 64-bit integer vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVQ / MOVD instruction.
+/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
 ///
 /// \param __i
 ///    A 64-bit signed integer.
@@ -97,11 +97,11 @@
     return (__m64)__i;
 }
 
-/// \brief Casts a 64-bit integer vector into a 64-bit signed integer value.
+/// Casts a 64-bit integer vector into a 64-bit signed integer value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVQ / MOVD instruction.
+/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector.
@@ -113,7 +113,7 @@
     return (long long)__m;
 }
 
-/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+/// Converts 16-bit signed integers from both 64-bit integer vector
 ///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
 ///    a 64-bit integer vector of [8 x i8] as the result. Positive values
 ///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
@@ -121,7 +121,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PACKSSWB instruction.
+/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
@@ -143,7 +143,7 @@
     return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Converts 32-bit signed integers from both 64-bit integer vector
+/// Converts 32-bit signed integers from both 64-bit integer vector
 ///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
 ///    a 64-bit integer vector of [4 x i16] as the result. Positive values
 ///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
@@ -151,7 +151,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PACKSSDW instruction.
+/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
@@ -173,7 +173,7 @@
     return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
 }
 
-/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+/// Converts 16-bit signed integers from both 64-bit integer vector
 ///    parameters of [4 x i16] into 8-bit unsigned integer values, and
 ///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
 ///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
@@ -181,7 +181,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PACKUSWB instruction.
+/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
@@ -203,24 +203,24 @@
     return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
 }
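
A worked example of the signed saturation described above, assuming MMX is
available (compile with -mmmx; pack_demo and its values are illustrative
only):

    #include <mmintrin.h>

    static void pack_demo(void) {
      /* _mm_set_pi16 takes elements high-to-low, so a holds
       * {300, -200, 5, -5} from low to high. */
      __m64 a = _mm_set_pi16(-5, 5, -200, 300);
      __m64 r = _mm_packs_pi16(a, _mm_setzero_si64());
      /* Low four bytes of r, low to high: 127 (300 saturated),
       * -128 (-200 saturated), 5, -5. */
      (void)r;
      _mm_empty(); /* leave the MMX/x87 state clean afterwards */
    }
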
 
-/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
+/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PUNPCKHBW instruction.
+/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
 ///
 /// \param __m1
-///    A 64-bit integer vector of [8 x i8].
-///    Bits [39:32] are written to bits [7:0] of the result.
-///    Bits [47:40] are written to bits [23:16] of the result.
-///    Bits [55:48] are written to bits [39:32] of the result.
+///    A 64-bit integer vector of [8 x i8]. \n
+///    Bits [39:32] are written to bits [7:0] of the result. \n
+///    Bits [47:40] are written to bits [23:16] of the result. \n
+///    Bits [55:48] are written to bits [39:32] of the result. \n
 ///    Bits [63:56] are written to bits [55:48] of the result.
 /// \param __m2
 ///    A 64-bit integer vector of [8 x i8].
-///    Bits [39:32] are written to bits [15:8] of the result.
-///    Bits [47:40] are written to bits [31:24] of the result.
-///    Bits [55:48] are written to bits [47:40] of the result.
+///    Bits [39:32] are written to bits [15:8] of the result. \n
+///    Bits [47:40] are written to bits [31:24] of the result. \n
+///    Bits [55:48] are written to bits [47:40] of the result. \n
 ///    Bits [63:56] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
@@ -230,20 +230,20 @@
     return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
 }
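
To make the bit-range tables above concrete, here is the byte-level view of
the high unpack, with hypothetical demo values:

    #include <mmintrin.h>

    static void unpackhi_demo(void) {
      /* Bytes of a, low to high: 0..7; bytes of b: 10..17
       * (_mm_set_pi8 takes elements high-to-low). */
      __m64 a = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
      __m64 b = _mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10);
      __m64 r = _mm_unpackhi_pi8(a, b);
      /* Bytes of r, low to high: 4, 14, 5, 15, 6, 16, 7, 17. */
      (void)r;
      _mm_empty();
    }
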
 
-/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+/// Unpacks the upper 32 bits from two 64-bit integer vectors of
 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PUNPCKHWD instruction.
+/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
-///    Bits [47:32] are written to bits [15:0] of the result.
+///    Bits [47:32] are written to bits [15:0] of the result. \n
 ///    Bits [63:48] are written to bits [47:32] of the result.
 /// \param __m2
 ///    A 64-bit integer vector of [4 x i16].
-///    Bits [47:32] are written to bits [31:16] of the result.
+///    Bits [47:32] are written to bits [31:16] of the result. \n
 ///    Bits [63:48] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
@@ -253,12 +253,12 @@
     return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+/// Unpacks the upper 32 bits from two 64-bit integer vectors of
 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PUNPCKHDQ instruction.
+/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
@@ -274,24 +274,24 @@
     return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
 }
 
-/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
+/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PUNPCKLBW instruction.
+/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [8 x i8].
-///    Bits [7:0] are written to bits [7:0] of the result.
-///    Bits [15:8] are written to bits [23:16] of the result.
-///    Bits [23:16] are written to bits [39:32] of the result.
+///    Bits [7:0] are written to bits [7:0] of the result. \n
+///    Bits [15:8] are written to bits [23:16] of the result. \n
+///    Bits [23:16] are written to bits [39:32] of the result. \n
 ///    Bits [31:24] are written to bits [55:48] of the result.
 /// \param __m2
 ///    A 64-bit integer vector of [8 x i8].
-///    Bits [7:0] are written to bits [15:8] of the result.
-///    Bits [15:8] are written to bits [31:24] of the result.
-///    Bits [23:16] are written to bits [47:40] of the result.
+///    Bits [7:0] are written to bits [15:8] of the result. \n
+///    Bits [15:8] are written to bits [31:24] of the result. \n
+///    Bits [23:16] are written to bits [47:40] of the result. \n
 ///    Bits [31:24] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
@@ -301,20 +301,20 @@
     return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+/// Unpacks the lower 32 bits from two 64-bit integer vectors of
 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PUNPCKLWD instruction.
+/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
-///    Bits [15:0] are written to bits [15:0] of the result.
+///    Bits [15:0] are written to bits [15:0] of the result. \n
 ///    Bits [31:16] are written to bits [47:32] of the result.
 /// \param __m2
 ///    A 64-bit integer vector of [4 x i16].
-///    Bits [15:0] are written to bits [31:16] of the result.
+///    Bits [15:0] are written to bits [31:16] of the result. \n
 ///    Bits [31:16] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
@@ -324,12 +324,12 @@
     return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+/// Unpacks the lower 32 bits from two 64-bit integer vectors of
 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PUNPCKLDQ instruction.
+/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
@@ -345,14 +345,14 @@
     return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
 }
 
-/// \brief Adds each 8-bit integer element of the first 64-bit integer vector
+/// Adds each 8-bit integer element of the first 64-bit integer vector
 ///    of [8 x i8] to the corresponding 8-bit integer element of the second
 ///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
 ///    packed into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PADDB instruction.
+/// This intrinsic corresponds to the <c> PADDB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [8 x i8].
@@ -366,14 +366,14 @@
     return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// \brief Adds each 16-bit integer element of the first 64-bit integer vector
+/// Adds each 16-bit integer element of the first 64-bit integer vector
 ///    of [4 x i16] to the corresponding 16-bit integer element of the second
 ///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
 ///    packed into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PADDW instruction.
+/// This intrinsic corresponds to the <c> PADDW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
@@ -387,14 +387,14 @@
     return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Adds each 32-bit integer element of the first 64-bit integer vector
+/// Adds each 32-bit integer element of the first 64-bit integer vector
 ///    of [2 x i32] to the corresponding 32-bit integer element of the second
 ///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
 ///    packed into a 64-bit integer vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PADDD instruction.
+/// This intrinsic corresponds to the <c> PADDD </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [2 x i32].
@@ -408,7 +408,7 @@
     return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
 }
 
-/// \brief Adds each 8-bit signed integer element of the first 64-bit integer
+/// Adds each 8-bit signed integer element of the first 64-bit integer
 ///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
 ///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
 ///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
@@ -416,7 +416,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PADDSB instruction.
+/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [8 x i8].
@@ -430,7 +430,7 @@
     return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// \brief Adds each 16-bit signed integer element of the first 64-bit integer
+/// Adds each 16-bit signed integer element of the first 64-bit integer
 ///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
 ///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
 ///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
@@ -439,7 +439,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PADDSW instruction.
+/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
@@ -453,7 +453,7 @@
     return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Adds each 8-bit unsigned integer element of the first 64-bit integer
+/// Adds each 8-bit unsigned integer element of the first 64-bit integer
 ///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
 ///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
 ///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
@@ -461,7 +461,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PADDUSB instruction.
+/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [8 x i8].
@@ -475,7 +475,7 @@
     return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// \brief Adds each 16-bit unsigned integer element of the first 64-bit integer
+/// Adds each 16-bit unsigned integer element of the first 64-bit integer
 ///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
 ///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
 ///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
@@ -483,7 +483,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PADDUSW instruction.
+/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
@@ -497,14 +497,14 @@
     return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Subtracts each 8-bit integer element of the second 64-bit integer
+/// Subtracts each 8-bit integer element of the second 64-bit integer
 ///    vector of [8 x i8] from the corresponding 8-bit integer element of the
 ///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
 ///    are packed into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSUBB instruction.
+/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
@@ -518,14 +518,14 @@
     return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// \brief Subtracts each 16-bit integer element of the second 64-bit integer
+/// Subtracts each 16-bit integer element of the second 64-bit integer
 ///    vector of [4 x i16] from the corresponding 16-bit integer element of the
 ///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
 ///    results are packed into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSUBW instruction.
+/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
@@ -539,14 +539,14 @@
     return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Subtracts each 32-bit integer element of the second 64-bit integer
+/// Subtracts each 32-bit integer element of the second 64-bit integer
 ///    vector of [2 x i32] from the corresponding 32-bit integer element of the
 ///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
 ///    results are packed into a 64-bit integer vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSUBD instruction.
+/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [2 x i32] containing the minuends.
@@ -560,7 +560,7 @@
     return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
 }
 
-/// \brief Subtracts each 8-bit signed integer element of the second 64-bit
+/// Subtracts each 8-bit signed integer element of the second 64-bit
 ///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
 ///    element of the first 64-bit integer vector of [8 x i8]. Positive results
 ///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
@@ -569,7 +569,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSUBSB instruction.
+/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
@@ -583,7 +583,7 @@
     return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// \brief Subtracts each 16-bit signed integer element of the second 64-bit
+/// Subtracts each 16-bit signed integer element of the second 64-bit
 ///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
 ///    element of the first 64-bit integer vector of [4 x i16]. Positive results
 ///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
@@ -592,7 +592,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSUBSW instruction.
+/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
@@ -606,16 +606,17 @@
     return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
+/// Subtracts each 8-bit unsigned integer element of the second 64-bit
 ///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
-///    element of the first 64-bit integer vector of [8 x i8]. If an element of
-///    the first vector is less than the corresponding element of the second
-///    vector, the result is saturated to 0. The results are packed into a
-///    64-bit integer vector of [8 x i8].
+///    element of the first 64-bit integer vector of [8 x i8].
+///
+///    If an element of the first vector is less than the corresponding element
+///    of the second vector, the result is saturated to 0. The results are
+///    packed into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSUBUSB instruction.
+/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
@@ -629,16 +630,17 @@
     return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
+/// Subtracts each 16-bit unsigned integer element of the second 64-bit
 ///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
-///    integer element of the first 64-bit integer vector of [4 x i16]. If an
-///    element of the first vector is less than the corresponding element of the
-///    second vector, the result is saturated to 0. The results are packed into
-///    a 64-bit integer vector of [4 x i16].
+///    integer element of the first 64-bit integer vector of [4 x i16].
+///
+///    If an element of the first vector is less than the corresponding element
+///    of the second vector, the result is saturated to 0. The results are
+///    packed into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSUBUSW instruction.
+/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
@@ -652,18 +654,20 @@
     return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+/// Multiplies each 16-bit signed integer element of the first 64-bit
 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
 ///    element of the second 64-bit integer vector of [4 x i16] to produce four
 ///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
 ///    The lower 32 bits of these two sums are packed into a 64-bit integer
-///    vector of [2 x i32]. For example, bits [15:0] of both parameters are
-///    multiplied, bits [31:16] of both parameters are multiplied, and the sum
-///    of both results is written to bits [31:0] of the result.
+///    vector of [2 x i32].
+///
+///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
+///    of both parameters are multiplied, and the sum of both results is written
+///    to bits [31:0] of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMADDWD instruction.
+/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
@@ -677,14 +681,14 @@
     return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
 }
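
A numeric instance of the multiply-and-add description above (demo values
only):

    #include <mmintrin.h>

    static void madd_demo(void) {
      __m64 a = _mm_set_pi16(4, 3, 2, 1);     /* low..high: 1, 2, 3, 4     */
      __m64 b = _mm_set_pi16(40, 30, 20, 10); /* low..high: 10, 20, 30, 40 */
      __m64 r = _mm_madd_pi16(a, b);
      /* r as [2 x i32], low to high:
       * {1*10 + 2*20, 3*30 + 4*40} = {50, 250}. */
      (void)r;
      _mm_empty();
    }
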
 
-/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+/// Multiplies each 16-bit signed integer element of the first 64-bit
 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMULHW instruction.
+/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
@@ -698,14 +702,14 @@
     return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+/// Multiplies each 16-bit signed integer element of the first 64-bit
 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMULLW instruction.
+/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
@@ -719,7 +723,7 @@
     return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Left-shifts each 16-bit signed integer element of the first
+/// Left-shifts each 16-bit signed integer element of the first
 ///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
 ///    of bits specified by the second parameter, which is a 64-bit integer. The
 ///    lower 16 bits of the results are packed into a 64-bit integer vector of
@@ -727,42 +731,44 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSLLW instruction.
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [4 x i16].
 /// \param __count
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-///    values. If __count is greater or equal to 16, the result is set to all 0.
+///    values. If \a __count is greater than or equal to 16, the result is set
+///    to all 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_pi16(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
 }
 
-/// \brief Left-shifts each 16-bit signed integer element of a 64-bit integer
+/// Left-shifts each 16-bit signed integer element of a 64-bit integer
 ///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
 ///    The lower 16 bits of the results are packed into a 64-bit integer vector
 ///    of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSLLW instruction.
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [4 x i16].
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-///    values. If __count is greater or equal to 16, the result is set to all 0.
+///    values. If \a __count is greater than or equal to 16, the result is set
+///    to all 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_pi16(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
 }
 
-/// \brief Left-shifts each 32-bit signed integer element of the first
+/// Left-shifts each 32-bit signed integer element of the first
 ///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
 ///    of bits specified by the second parameter, which is a 64-bit integer. The
 ///    lower 32 bits of the results are packed into a 64-bit integer vector of
@@ -770,91 +776,94 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSLLD instruction.
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [2 x i32].
 /// \param __count
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-///    values. If __count is greater or equal to 32, the result is set to all 0.
+///    values. If \a __count is greater than or equal to 32, the result is set
+///    to all 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_pi32(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
 }
 
-/// \brief Left-shifts each 32-bit signed integer element of a 64-bit integer
+/// Left-shifts each 32-bit signed integer element of a 64-bit integer
 ///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
 ///    The lower 32 bits of the results are packed into a 64-bit integer vector
 ///    of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSLLD instruction.
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [2 x i32].
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-///    values. If __count is greater or equal to 32, the result is set to all 0.
+///    values. If \a __count is greater than or equal to 32, the result is set
+///    to all 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
 }
 
-/// \brief Left-shifts the first 64-bit integer parameter by the number of bits
+/// Left-shifts the first 64-bit integer parameter by the number of bits
 ///    specified by the second 64-bit integer parameter. The lower 64 bits of
 ///    result are returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSLLQ instruction.
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \param __count
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
-///     __count is greater or equal to 64, the result is set to 0.
+///     \a __count is greater than or equal to 64, the result is set to 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_si64(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
 }
 
-/// \brief Left-shifts the first parameter, which is a 64-bit integer, by the
+/// Left-shifts the first parameter, which is a 64-bit integer, by the
 ///    number of bits specified by the second parameter, which is a 32-bit
 ///    integer. The lower 64 bits of result are returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSLLQ instruction.
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
-///     __count is greater or equal to 64, the result is set to 0.
+///     \a __count is greater than or equal to 64, the result is set to 0.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_si64(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
 }
 
-/// \brief Right-shifts each 16-bit integer element of the first parameter,
+/// Right-shifts each 16-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer. High-order
-///    bits are filled with the sign bit of the initial value of each 16-bit
-///    element. The 16-bit results are packed into a 64-bit integer vector of
-///    [4 x i16].
+///    specified by the second parameter, which is a 64-bit integer.
+///
+///    High-order bits are filled with the sign bit of the initial value of each
+///    16-bit element. The 16-bit results are packed into a 64-bit integer
+///    vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRAW instruction.
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [4 x i16].
@@ -868,15 +877,16 @@
     return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
 }
 
-/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+/// Right-shifts each 16-bit integer element of a 64-bit integer vector
 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///
 ///    High-order bits are filled with the sign bit of the initial value of each
 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
 ///    vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRAW instruction.
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [4 x i16].
@@ -890,16 +900,17 @@
     return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
 }
 
-/// \brief Right-shifts each 32-bit integer element of the first parameter,
+/// Right-shifts each 32-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer. High-order
-///    bits are filled with the sign bit of the initial value of each 32-bit
-///    element. The 32-bit results are packed into a 64-bit integer vector of
-///    [2 x i32].
+///    specified by the second parameter, which is a 64-bit integer.
+///
+///    High-order bits are filled with the sign bit of the initial value of each
+///    32-bit element. The 32-bit results are packed into a 64-bit integer
+///    vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRAD instruction.
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [2 x i32].
@@ -913,15 +924,16 @@
     return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
 }
 
-/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+/// Right-shifts each 32-bit integer element of a 64-bit integer vector
 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///
 ///    High-order bits are filled with the sign bit of the initial value of each
 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
 ///    vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRAD instruction.
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [2 x i32].
@@ -935,15 +947,16 @@
     return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
 }
 
-/// \brief Right-shifts each 16-bit integer element of the first parameter,
+/// Right-shifts each 16-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer. High-order
-///    bits are cleared. The 16-bit results are packed into a 64-bit integer
-///    vector of [4 x i16].
+///    specified by the second parameter, which is a 64-bit integer.
+///
+///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
+///    integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRLW instruction.
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [4 x i16].
@@ -957,14 +970,15 @@
     return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
 }
 
-/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+/// Right-shifts each 16-bit integer element of a 64-bit integer vector
 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///
 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
 ///    integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRLW instruction.
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [4 x i16].
@@ -978,15 +992,16 @@
     return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
 }
 
-/// \brief Right-shifts each 32-bit integer element of the first parameter,
+/// Right-shifts each 32-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer. High-order
-///    bits are cleared. The 32-bit results are packed into a 64-bit integer
-///    vector of [2 x i32].
+///    specified by the second parameter, which is a 64-bit integer.
+///
+///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
+///    integer vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRLD instruction.
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [2 x i32].
@@ -1000,14 +1015,15 @@
     return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
 }
 
-/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+/// Right-shifts each 32-bit integer element of a 64-bit integer vector
 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///
 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
 ///    integer vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRLD instruction.
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector of [2 x i32].
@@ -1021,13 +1037,14 @@
     return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
 }
 
-/// \brief Right-shifts the first 64-bit integer parameter by the number of bits
-///    specified by the second 64-bit integer parameter. High-order bits are
-///    cleared.
+/// Right-shifts the first 64-bit integer parameter by the number of bits
+///    specified by the second 64-bit integer parameter.
+///
+///    High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRLQ instruction.
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
@@ -1040,13 +1057,15 @@
     return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
 }
 
-/// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
+/// Right-shifts the first parameter, which is a 64-bit integer, by the
 ///    number of bits specified by the second parameter, which is a 32-bit
-///    integer. High-order bits are cleared.
+///    integer.
+///
+///    High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSRLQ instruction.
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
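
The hunks above split each shift description into behavior and fill semantics: PSRAW/PSRAD replicate the sign bit into the vacated high-order bits, while PSRLW/PSRLD/PSRLQ clear them. A minimal usage sketch, assuming the standard mmintrin.h wrapper names for the builtins shown in these hunks (e.g. _mm_srai_pi16 wraps __builtin_ia32_psrawi) and an MMX-capable x86 target:

#include <x86intrin.h>

void shift_demo(void) {
  __m64 v = _mm_set_pi16(-32768, 16, -4, 1); /* [4 x i16], high element first */
  __m64 a = _mm_srai_pi16(v, 2); /* PSRAW: sign-fills; -32768 -> -8192, -4 -> -1 */
  __m64 l = _mm_srli_pi16(v, 2); /* PSRLW: zero-fills; 0x8000 -> 0x2000 */
  __m64 q = _mm_srli_si64(v, 8); /* PSRLQ: one 64-bit logical shift */
  (void)a; (void)l; (void)q;
  _mm_empty();                   /* leave MMX state before touching x87 */
}
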
@@ -1059,11 +1078,11 @@
     return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
 }
 
-/// \brief Performs a bitwise AND of two 64-bit integer vectors.
+/// Performs a bitwise AND of two 64-bit integer vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PAND instruction.
+/// This intrinsic corresponds to the <c> PAND </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector.
@@ -1077,13 +1096,13 @@
     return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
 }
 
-/// \brief Performs a bitwise NOT of the first 64-bit integer vector, and then
+/// Performs a bitwise NOT of the first 64-bit integer vector, and then
 ///    performs a bitwise AND of the intermediate result and the second 64-bit
 ///    integer vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PANDN instruction.
+/// This intrinsic corresponds to the <c> PANDN </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector. The one's complement of this parameter is used
@@ -1098,11 +1117,11 @@
     return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
 }
 
-/// \brief Performs a bitwise OR of two 64-bit integer vectors.
+/// Performs a bitwise OR of two 64-bit integer vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c POR instruction.
+/// This intrinsic corresponds to the <c> POR </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector.
@@ -1116,11 +1135,11 @@
     return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
 }
 
-/// \brief Performs a bitwise exclusive OR of two 64-bit integer vectors.
+/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PXOR instruction.
+/// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector.
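
Taken together, PAND/PANDN/POR/PXOR give the usual bitwise toolkit; note that PANDN complements its first operand before the AND. A short sketch, assuming the standard wrapper names _mm_and_si64 / _mm_andnot_si64 / _mm_or_si64 / _mm_xor_si64 for the builtins shown here:

#include <x86intrin.h>

void bitwise_demo(void) {
  __m64 m1 = _mm_set1_pi32(0x0F0F0F0F);
  __m64 m2 = _mm_set1_pi32(0x00FF00FF);
  __m64 and_v  = _mm_and_si64(m1, m2);    /* 0x000F000F per lane */
  __m64 andn_v = _mm_andnot_si64(m1, m2); /* (~m1) & m2 = 0x00F000F0 per lane */
  __m64 or_v   = _mm_or_si64(m1, m2);     /* 0x0FFF0FFF per lane */
  __m64 xor_v  = _mm_xor_si64(m1, m2);    /* 0x0FF00FF0 per lane */
  (void)and_v; (void)andn_v; (void)or_v; (void)xor_v;
  _mm_empty();
}
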
@@ -1134,14 +1153,15 @@
     return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
 }
 
-/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+/// Compares the 8-bit integer elements of two 64-bit integer vectors of
 ///    [8 x i8] to determine if the element of the first vector is equal to the
-///    corresponding element of the second vector. The comparison yields 0 for
-///    false, 0xFF for true.
+///    corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PCMPEQB instruction.
+/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [8 x i8].
@@ -1155,14 +1175,15 @@
     return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+/// Compares the 16-bit integer elements of two 64-bit integer vectors of
 ///    [4 x i16] to determine if the element of the first vector is equal to the
-///    corresponding element of the second vector. The comparison yields 0 for
-///    false, 0xFFFF for true.
+///    corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PCMPEQW instruction.
+/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
@@ -1176,14 +1197,15 @@
     return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+/// Compares the 32-bit integer elements of two 64-bit integer vectors of
 ///    [2 x i32] to determine if the element of the first vector is equal to the
-///    corresponding element of the second vector. The comparison yields 0 for
-///    false, 0xFFFFFFFF for true.
+///    corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PCMPEQD instruction.
+/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [2 x i32].
@@ -1197,14 +1219,15 @@
     return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
 }
 
-/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+/// Compares the 8-bit integer elements of two 64-bit integer vectors of
 ///    [8 x i8] to determine if the element of the first vector is greater than
-///    the corresponding element of the second vector. The comparison yields 0
-///    for false, 0xFF for true.
+///    the corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PCMPGTB instruction.
+/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [8 x i8].
@@ -1218,14 +1241,15 @@
     return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+/// Compares the 16-bit integer elements of two 64-bit integer vectors of
 ///    [4 x i16] to determine if the element of the first vector is greater than
-///    the corresponding element of the second vector. The comparison yields 0
-///    for false, 0xFFFF for true.
+///    the corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PCMPGTW instruction.
+/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [4 x i16].
@@ -1239,14 +1263,15 @@
     return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+/// Compares the 32-bit integer elements of two 64-bit integer vectors of
 ///    [2 x i32] to determine if the element of the first vector is greater than
-///    the corresponding element of the second vector. The comparison yields 0
-///    for false, 0xFFFFFFFF for true.
+///    the corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PCMPGTD instruction.
+/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
 ///
 /// \param __m1
 ///    A 64-bit integer vector of [2 x i32].
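
Because each comparison yields an all-ones or all-zero element (0xFF/0xFFFF/0xFFFFFFFF for true, 0 for false), the results compose directly with the bitwise intrinsics above to select elements without branching. A hedged sketch, assuming the standard wrapper names _mm_cmpgt_pi16, _mm_and_si64, and _mm_andnot_si64:

#include <x86intrin.h>

/* Branch-free per-element max of two [4 x i16] vectors. */
__m64 max_pi16(__m64 a, __m64 b) {
  __m64 mask = _mm_cmpgt_pi16(a, b);           /* 0xFFFF where a > b */
  return _mm_or_si64(_mm_and_si64(mask, a),    /* take a where a > b */
                     _mm_andnot_si64(mask, b)); /* else take b */
}
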
@@ -1260,20 +1285,20 @@
     return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
 }
 
-/// \brief Constructs a 64-bit integer vector initialized to zero.
+/// Constructs a 64-bit integer vector initialized to zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the the \c VXORPS / XORPS instruction.
+/// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to zero.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setzero_si64(void)
 {
-    return (__m64){ 0LL };
+    return __extension__ (__m64){ 0LL };
 }
 
-/// \brief Constructs a 64-bit integer vector initialized with the specified
+/// Constructs a 64-bit integer vector initialized with the specified
 ///    32-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -1294,7 +1319,7 @@
     return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
 }
 
-/// \brief Constructs a 64-bit integer vector initialized with the specified
+/// Constructs a 64-bit integer vector initialized with the specified
 ///    16-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -1317,7 +1342,7 @@
     return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
 }
 
-/// \brief Constructs a 64-bit integer vector initialized with the specified
+/// Constructs a 64-bit integer vector initialized with the specified
 ///    8-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -1350,13 +1375,14 @@
                                                __b4, __b5, __b6, __b7);
 }
 
-/// \brief Constructs a 64-bit integer vector of [2 x i32], with each of the
+/// Constructs a 64-bit integer vector of [2 x i32], with each of the
 ///    32-bit integer vector elements set to the specified 32-bit integer
 ///    value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSHUFD / PSHUFD instruction.
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
 ///
 /// \param __i
 ///    A 32-bit integer value used to initialize each vector element of the
@@ -1368,13 +1394,14 @@
     return _mm_set_pi32(__i, __i);
 }
 
-/// \brief Constructs a 64-bit integer vector of [4 x i16], with each of the
+/// Constructs a 64-bit integer vector of [4 x i16], with each of the
 ///    16-bit integer vector elements set to the specified 16-bit integer
 ///    value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPSHUFLW / PSHUFLW instruction.
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
 ///
 /// \param __w
 ///    A 16-bit integer value used to initialize each vector element of the
@@ -1386,13 +1413,13 @@
     return _mm_set_pi16(__w, __w, __w, __w);
 }
 
-/// \brief Constructs a 64-bit integer vector of [8 x i8], with each of the
+/// Constructs a 64-bit integer vector of [8 x i8], with each of the
 ///    8-bit integer vector elements set to the specified 8-bit integer value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPUNPCKLBW + VPSHUFLW / \c PUNPCKLBW +
-///    PSHUFLW instruction.
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
 ///
 /// \param __b
 ///    An 8-bit integer value used to initialize each vector element of the
@@ -1404,7 +1431,7 @@
     return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
 }
 
-/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+/// Constructs a 64-bit integer vector, initialized in reverse order with
 ///    the specified 32-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -1425,7 +1452,7 @@
     return _mm_set_pi32(__i1, __i0);
 }
 
-/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+/// Constructs a 64-bit integer vector, initialized in reverse order with
 ///    the specified 16-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@@ -1448,7 +1475,7 @@
     return _mm_set_pi16(__w3, __w2, __w1, __w0);
 }
 
-/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+/// Constructs a 64-bit integer vector, initialized in reverse order with
 ///    the specified 8-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
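
One usage note on this group of initializers: _mm_set_* takes elements most-significant first, _mm_setr_* takes them in reverse (memory) order, and _mm_set1_* splats a single value; the hunks above now document the splat forms as plain utility functions rather than as PSHUF lowerings. A minimal sketch:

#include <x86intrin.h>

void init_demo(void) {
  __m64 z  = _mm_setzero_si64();                    /* all elements zero */
  __m64 s1 = _mm_set1_pi16(7);                      /* {7, 7, 7, 7} */
  __m64 hi = _mm_set_pi32(0x11111111, 0x22222222);  /* first arg is the upper lane */
  __m64 lo = _mm_setr_pi32(0x11111111, 0x22222222); /* reversed: first arg is the lower lane */
  (void)z; (void)s1; (void)hi; (void)lo;
  _mm_empty();
}
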
diff --git a/darwin-x86/clang-headers/module.modulemap b/darwin-x86/clang-headers/module.modulemap
index 3e40d2c..1d1af57 100644
--- a/darwin-x86/clang-headers/module.modulemap
+++ b/darwin-x86/clang-headers/module.modulemap
@@ -38,6 +38,7 @@
     explicit module neon {
       requires neon
       header "arm_neon.h"
+      header "arm_fp16.h"
       export *
     }
   }
@@ -61,13 +62,27 @@
     textual header "xopintrin.h"
     textual header "fma4intrin.h"
     textual header "mwaitxintrin.h"
+    textual header "clzerointrin.h"
+    textual header "wbnoinvdintrin.h"
+    textual header "cldemoteintrin.h"
+    textual header "waitpkgintrin.h"
+    textual header "movdirintrin.h"
+    textual header "pconfigintrin.h"
+    textual header "sgxintrin.h"
+    textual header "ptwriteintrin.h"
+    textual header "invpcidintrin.h"
+
+    textual header "__wmmintrin_aes.h"
+    textual header "__wmmintrin_pclmul.h"
 
     explicit module mm_malloc {
+      requires !freestanding
       header "mm_malloc.h"
       export * // note: for <stdlib.h> dependency
     }
 
     explicit module cpuid {
+      requires gnuinlineasm
       header "cpuid.h"
     }
 
@@ -125,14 +140,6 @@
       export aes
       export pclmul
     }
-
-    explicit module aes {
-      header "__wmmintrin_aes.h"
-    }
-
-    explicit module pclmul {
-      header "__wmmintrin_pclmul.h"
-    }
   }
 
   explicit module systemz {
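
The new `requires` constraints make the module map match reality: mm_malloc.h pulls in <stdlib.h>, so it cannot build in a freestanding environment, and cpuid.h is implemented with GNU inline assembly. Typical use of the cpuid header is unaffected; a small sketch (the SSE4.2 bit position, leaf 1 ECX bit 20, is taken from the Intel SDM):

#include <cpuid.h>

int have_sse42(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return 0;                     /* CPUID leaf 1 not supported */
  return (ecx & (1u << 20)) != 0; /* SSE4.2 feature flag */
}
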
diff --git a/darwin-x86/clang-headers/movdirintrin.h b/darwin-x86/clang-headers/movdirintrin.h
new file mode 100644
index 0000000..ec20c53
--- /dev/null
+++ b/darwin-x86/clang-headers/movdirintrin.h
@@ -0,0 +1,63 @@
+/*===------------------------- movdirintrin.h ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _MOVDIRINTRIN_H
+#define _MOVDIRINTRIN_H
+
+/* Move doubleword as direct store */
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__,  __target__("movdiri")))
+_directstoreu_u32 (void *__dst, unsigned int  __value)
+{
+  __builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value);
+}
+
+#ifdef __x86_64__
+
+/* Move quadword as direct store */
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__,  __target__("movdiri")))
+_directstoreu_u64 (void *__dst, unsigned long __value)
+{
+  __builtin_ia32_directstore_u64((unsigned long *)__dst, __value);
+}
+
+#endif /* __x86_64__ */
+
+/*
+ * movdir64b - Move 64 bytes as direct store.
+ * The destination must be 64 byte aligned, and the store is atomic.
+ * The source address has no alignment requirement, and the load from
+ * the source address is not atomic.
+ */
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__,  __target__("movdir64b")))
+_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src)
+{
+  __builtin_ia32_movdir64b(__dst, __src);
+}
+
+#endif /* _MOVDIRINTRIN_H */
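
A hedged sketch of the new direct-store intrinsics in use. The portal/descriptor names are hypothetical; the alignment and atomicity requirements come from the movdir64b comment above, and the code assumes a target built with -mmovdiri -mmovdir64b (or the matching target attributes):

#include <x86intrin.h>

/* Hypothetical device-descriptor submission. */
void submit(void *portal, const void *desc, unsigned int tail) {
  /* One atomic 64-byte store; portal must be 64-byte aligned. */
  _movdir64b(portal, desc);
  /* Doorbell write as a direct store. */
  _directstoreu_u32((char *)portal + 64, tail);
}
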
diff --git a/darwin-x86/clang-headers/msa.h b/darwin-x86/clang-headers/msa.h
new file mode 100644
index 0000000..da680f5
--- /dev/null
+++ b/darwin-x86/clang-headers/msa.h
@@ -0,0 +1,583 @@
+/*===---- msa.h - MIPS MSA intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MSA_H
+#define _MSA_H 1
+
+#if defined(__mips_msa)
+typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
+typedef signed char v16i8_b __attribute__((vector_size(16), aligned(1)));
+typedef unsigned char v16u8 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned char v16u8_b __attribute__((vector_size(16), aligned(1)));
+typedef short v8i16 __attribute__((vector_size(16), aligned(16)));
+typedef short v8i16_h __attribute__((vector_size(16), aligned(2)));
+typedef unsigned short v8u16 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned short v8u16_h __attribute__((vector_size(16), aligned(2)));
+typedef int v4i32 __attribute__((vector_size(16), aligned(16)));
+typedef int v4i32_w __attribute__((vector_size(16), aligned(4)));
+typedef unsigned int v4u32 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned int v4u32_w __attribute__((vector_size(16), aligned(4)));
+typedef long long v2i64 __attribute__((vector_size(16), aligned(16)));
+typedef long long v2i64_d __attribute__((vector_size(16), aligned(8)));
+typedef unsigned long long v2u64 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned long long v2u64_d __attribute__((vector_size(16), aligned(8)));
+typedef float v4f32 __attribute__((vector_size(16), aligned(16)));
+typedef float v4f32_w __attribute__((vector_size(16), aligned(4)));
+typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
+typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
+
+#define __msa_sll_b __builtin_msa_sll_b
+#define __msa_sll_h __builtin_msa_sll_h
+#define __msa_sll_w __builtin_msa_sll_w
+#define __msa_sll_d __builtin_msa_sll_d
+#define __msa_slli_b __builtin_msa_slli_b
+#define __msa_slli_h __builtin_msa_slli_h
+#define __msa_slli_w __builtin_msa_slli_w
+#define __msa_slli_d __builtin_msa_slli_d
+#define __msa_sra_b __builtin_msa_sra_b
+#define __msa_sra_h __builtin_msa_sra_h
+#define __msa_sra_w __builtin_msa_sra_w
+#define __msa_sra_d __builtin_msa_sra_d
+#define __msa_srai_b __builtin_msa_srai_b
+#define __msa_srai_h __builtin_msa_srai_h
+#define __msa_srai_w __builtin_msa_srai_w
+#define __msa_srai_d __builtin_msa_srai_d
+#define __msa_srar_b __builtin_msa_srar_b
+#define __msa_srar_h __builtin_msa_srar_h
+#define __msa_srar_w __builtin_msa_srar_w
+#define __msa_srar_d __builtin_msa_srar_d
+#define __msa_srari_b __builtin_msa_srari_b
+#define __msa_srari_h __builtin_msa_srari_h
+#define __msa_srari_w __builtin_msa_srari_w
+#define __msa_srari_d __builtin_msa_srari_d
+#define __msa_srl_b __builtin_msa_srl_b
+#define __msa_srl_h __builtin_msa_srl_h
+#define __msa_srl_w __builtin_msa_srl_w
+#define __msa_srl_d __builtin_msa_srl_d
+#define __msa_srli_b __builtin_msa_srli_b
+#define __msa_srli_h __builtin_msa_srli_h
+#define __msa_srli_w __builtin_msa_srli_w
+#define __msa_srli_d __builtin_msa_srli_d
+#define __msa_srlr_b __builtin_msa_srlr_b
+#define __msa_srlr_h __builtin_msa_srlr_h
+#define __msa_srlr_w __builtin_msa_srlr_w
+#define __msa_srlr_d __builtin_msa_srlr_d
+#define __msa_srlri_b __builtin_msa_srlri_b
+#define __msa_srlri_h __builtin_msa_srlri_h
+#define __msa_srlri_w __builtin_msa_srlri_w
+#define __msa_srlri_d __builtin_msa_srlri_d
+#define __msa_bclr_b __builtin_msa_bclr_b
+#define __msa_bclr_h __builtin_msa_bclr_h
+#define __msa_bclr_w __builtin_msa_bclr_w
+#define __msa_bclr_d __builtin_msa_bclr_d
+#define __msa_bclri_b __builtin_msa_bclri_b
+#define __msa_bclri_h __builtin_msa_bclri_h
+#define __msa_bclri_w __builtin_msa_bclri_w
+#define __msa_bclri_d __builtin_msa_bclri_d
+#define __msa_bset_b __builtin_msa_bset_b
+#define __msa_bset_h __builtin_msa_bset_h
+#define __msa_bset_w __builtin_msa_bset_w
+#define __msa_bset_d __builtin_msa_bset_d
+#define __msa_bseti_b __builtin_msa_bseti_b
+#define __msa_bseti_h __builtin_msa_bseti_h
+#define __msa_bseti_w __builtin_msa_bseti_w
+#define __msa_bseti_d __builtin_msa_bseti_d
+#define __msa_bneg_b __builtin_msa_bneg_b
+#define __msa_bneg_h __builtin_msa_bneg_h
+#define __msa_bneg_w __builtin_msa_bneg_w
+#define __msa_bneg_d __builtin_msa_bneg_d
+#define __msa_bnegi_b __builtin_msa_bnegi_b
+#define __msa_bnegi_h __builtin_msa_bnegi_h
+#define __msa_bnegi_w __builtin_msa_bnegi_w
+#define __msa_bnegi_d __builtin_msa_bnegi_d
+#define __msa_binsl_b __builtin_msa_binsl_b
+#define __msa_binsl_h __builtin_msa_binsl_h
+#define __msa_binsl_w __builtin_msa_binsl_w
+#define __msa_binsl_d __builtin_msa_binsl_d
+#define __msa_binsli_b __builtin_msa_binsli_b
+#define __msa_binsli_h __builtin_msa_binsli_h
+#define __msa_binsli_w __builtin_msa_binsli_w
+#define __msa_binsli_d __builtin_msa_binsli_d
+#define __msa_binsr_b __builtin_msa_binsr_b
+#define __msa_binsr_h __builtin_msa_binsr_h
+#define __msa_binsr_w __builtin_msa_binsr_w
+#define __msa_binsr_d __builtin_msa_binsr_d
+#define __msa_binsri_b __builtin_msa_binsri_b
+#define __msa_binsri_h __builtin_msa_binsri_h
+#define __msa_binsri_w __builtin_msa_binsri_w
+#define __msa_binsri_d __builtin_msa_binsri_d
+#define __msa_addv_b __builtin_msa_addv_b
+#define __msa_addv_h __builtin_msa_addv_h
+#define __msa_addv_w __builtin_msa_addv_w
+#define __msa_addv_d __builtin_msa_addv_d
+#define __msa_addvi_b __builtin_msa_addvi_b
+#define __msa_addvi_h __builtin_msa_addvi_h
+#define __msa_addvi_w __builtin_msa_addvi_w
+#define __msa_addvi_d __builtin_msa_addvi_d
+#define __msa_subv_b __builtin_msa_subv_b
+#define __msa_subv_h __builtin_msa_subv_h
+#define __msa_subv_w __builtin_msa_subv_w
+#define __msa_subv_d __builtin_msa_subv_d
+#define __msa_subvi_b __builtin_msa_subvi_b
+#define __msa_subvi_h __builtin_msa_subvi_h
+#define __msa_subvi_w __builtin_msa_subvi_w
+#define __msa_subvi_d __builtin_msa_subvi_d
+#define __msa_max_s_b __builtin_msa_max_s_b
+#define __msa_max_s_h __builtin_msa_max_s_h
+#define __msa_max_s_w __builtin_msa_max_s_w
+#define __msa_max_s_d __builtin_msa_max_s_d
+#define __msa_maxi_s_b __builtin_msa_maxi_s_b
+#define __msa_maxi_s_h __builtin_msa_maxi_s_h
+#define __msa_maxi_s_w __builtin_msa_maxi_s_w
+#define __msa_maxi_s_d __builtin_msa_maxi_s_d
+#define __msa_max_u_b __builtin_msa_max_u_b
+#define __msa_max_u_h __builtin_msa_max_u_h
+#define __msa_max_u_w __builtin_msa_max_u_w
+#define __msa_max_u_d __builtin_msa_max_u_d
+#define __msa_maxi_u_b __builtin_msa_maxi_u_b
+#define __msa_maxi_u_h __builtin_msa_maxi_u_h
+#define __msa_maxi_u_w __builtin_msa_maxi_u_w
+#define __msa_maxi_u_d __builtin_msa_maxi_u_d
+#define __msa_min_s_b __builtin_msa_min_s_b
+#define __msa_min_s_h __builtin_msa_min_s_h
+#define __msa_min_s_w __builtin_msa_min_s_w
+#define __msa_min_s_d __builtin_msa_min_s_d
+#define __msa_mini_s_b __builtin_msa_mini_s_b
+#define __msa_mini_s_h __builtin_msa_mini_s_h
+#define __msa_mini_s_w __builtin_msa_mini_s_w
+#define __msa_mini_s_d __builtin_msa_mini_s_d
+#define __msa_min_u_b __builtin_msa_min_u_b
+#define __msa_min_u_h __builtin_msa_min_u_h
+#define __msa_min_u_w __builtin_msa_min_u_w
+#define __msa_min_u_d __builtin_msa_min_u_d
+#define __msa_mini_u_b __builtin_msa_mini_u_b
+#define __msa_mini_u_h __builtin_msa_mini_u_h
+#define __msa_mini_u_w __builtin_msa_mini_u_w
+#define __msa_mini_u_d __builtin_msa_mini_u_d
+#define __msa_max_a_b __builtin_msa_max_a_b
+#define __msa_max_a_h __builtin_msa_max_a_h
+#define __msa_max_a_w __builtin_msa_max_a_w
+#define __msa_max_a_d __builtin_msa_max_a_d
+#define __msa_min_a_b __builtin_msa_min_a_b
+#define __msa_min_a_h __builtin_msa_min_a_h
+#define __msa_min_a_w __builtin_msa_min_a_w
+#define __msa_min_a_d __builtin_msa_min_a_d
+#define __msa_ceq_b __builtin_msa_ceq_b
+#define __msa_ceq_h __builtin_msa_ceq_h
+#define __msa_ceq_w __builtin_msa_ceq_w
+#define __msa_ceq_d __builtin_msa_ceq_d
+#define __msa_ceqi_b __builtin_msa_ceqi_b
+#define __msa_ceqi_h __builtin_msa_ceqi_h
+#define __msa_ceqi_w __builtin_msa_ceqi_w
+#define __msa_ceqi_d __builtin_msa_ceqi_d
+#define __msa_clt_s_b __builtin_msa_clt_s_b
+#define __msa_clt_s_h __builtin_msa_clt_s_h
+#define __msa_clt_s_w __builtin_msa_clt_s_w
+#define __msa_clt_s_d __builtin_msa_clt_s_d
+#define __msa_clti_s_b __builtin_msa_clti_s_b
+#define __msa_clti_s_h __builtin_msa_clti_s_h
+#define __msa_clti_s_w __builtin_msa_clti_s_w
+#define __msa_clti_s_d __builtin_msa_clti_s_d
+#define __msa_clt_u_b __builtin_msa_clt_u_b
+#define __msa_clt_u_h __builtin_msa_clt_u_h
+#define __msa_clt_u_w __builtin_msa_clt_u_w
+#define __msa_clt_u_d __builtin_msa_clt_u_d
+#define __msa_clti_u_b __builtin_msa_clti_u_b
+#define __msa_clti_u_h __builtin_msa_clti_u_h
+#define __msa_clti_u_w __builtin_msa_clti_u_w
+#define __msa_clti_u_d __builtin_msa_clti_u_d
+#define __msa_cle_s_b __builtin_msa_cle_s_b
+#define __msa_cle_s_h __builtin_msa_cle_s_h
+#define __msa_cle_s_w __builtin_msa_cle_s_w
+#define __msa_cle_s_d __builtin_msa_cle_s_d
+#define __msa_clei_s_b __builtin_msa_clei_s_b
+#define __msa_clei_s_h __builtin_msa_clei_s_h
+#define __msa_clei_s_w __builtin_msa_clei_s_w
+#define __msa_clei_s_d __builtin_msa_clei_s_d
+#define __msa_cle_u_b __builtin_msa_cle_u_b
+#define __msa_cle_u_h __builtin_msa_cle_u_h
+#define __msa_cle_u_w __builtin_msa_cle_u_w
+#define __msa_cle_u_d __builtin_msa_cle_u_d
+#define __msa_clei_u_b __builtin_msa_clei_u_b
+#define __msa_clei_u_h __builtin_msa_clei_u_h
+#define __msa_clei_u_w __builtin_msa_clei_u_w
+#define __msa_clei_u_d __builtin_msa_clei_u_d
+#define __msa_ld_b __builtin_msa_ld_b
+#define __msa_ld_h __builtin_msa_ld_h
+#define __msa_ld_w __builtin_msa_ld_w
+#define __msa_ld_d __builtin_msa_ld_d
+#define __msa_st_b __builtin_msa_st_b
+#define __msa_st_h __builtin_msa_st_h
+#define __msa_st_w __builtin_msa_st_w
+#define __msa_st_d __builtin_msa_st_d
+#define __msa_sat_s_b __builtin_msa_sat_s_b
+#define __msa_sat_s_h __builtin_msa_sat_s_h
+#define __msa_sat_s_w __builtin_msa_sat_s_w
+#define __msa_sat_s_d __builtin_msa_sat_s_d
+#define __msa_sat_u_b __builtin_msa_sat_u_b
+#define __msa_sat_u_h __builtin_msa_sat_u_h
+#define __msa_sat_u_w __builtin_msa_sat_u_w
+#define __msa_sat_u_d __builtin_msa_sat_u_d
+#define __msa_add_a_b __builtin_msa_add_a_b
+#define __msa_add_a_h __builtin_msa_add_a_h
+#define __msa_add_a_w __builtin_msa_add_a_w
+#define __msa_add_a_d __builtin_msa_add_a_d
+#define __msa_adds_a_b __builtin_msa_adds_a_b
+#define __msa_adds_a_h __builtin_msa_adds_a_h
+#define __msa_adds_a_w __builtin_msa_adds_a_w
+#define __msa_adds_a_d __builtin_msa_adds_a_d
+#define __msa_adds_s_b __builtin_msa_adds_s_b
+#define __msa_adds_s_h __builtin_msa_adds_s_h
+#define __msa_adds_s_w __builtin_msa_adds_s_w
+#define __msa_adds_s_d __builtin_msa_adds_s_d
+#define __msa_adds_u_b __builtin_msa_adds_u_b
+#define __msa_adds_u_h __builtin_msa_adds_u_h
+#define __msa_adds_u_w __builtin_msa_adds_u_w
+#define __msa_adds_u_d __builtin_msa_adds_u_d
+#define __msa_ave_s_b __builtin_msa_ave_s_b
+#define __msa_ave_s_h __builtin_msa_ave_s_h
+#define __msa_ave_s_w __builtin_msa_ave_s_w
+#define __msa_ave_s_d __builtin_msa_ave_s_d
+#define __msa_ave_u_b __builtin_msa_ave_u_b
+#define __msa_ave_u_h __builtin_msa_ave_u_h
+#define __msa_ave_u_w __builtin_msa_ave_u_w
+#define __msa_ave_u_d __builtin_msa_ave_u_d
+#define __msa_aver_s_b __builtin_msa_aver_s_b
+#define __msa_aver_s_h __builtin_msa_aver_s_h
+#define __msa_aver_s_w __builtin_msa_aver_s_w
+#define __msa_aver_s_d __builtin_msa_aver_s_d
+#define __msa_aver_u_b __builtin_msa_aver_u_b
+#define __msa_aver_u_h __builtin_msa_aver_u_h
+#define __msa_aver_u_w __builtin_msa_aver_u_w
+#define __msa_aver_u_d __builtin_msa_aver_u_d
+#define __msa_subs_s_b __builtin_msa_subs_s_b
+#define __msa_subs_s_h __builtin_msa_subs_s_h
+#define __msa_subs_s_w __builtin_msa_subs_s_w
+#define __msa_subs_s_d __builtin_msa_subs_s_d
+#define __msa_subs_u_b __builtin_msa_subs_u_b
+#define __msa_subs_u_h __builtin_msa_subs_u_h
+#define __msa_subs_u_w __builtin_msa_subs_u_w
+#define __msa_subs_u_d __builtin_msa_subs_u_d
+#define __msa_subsuu_s_b __builtin_msa_subsuu_s_b
+#define __msa_subsuu_s_h __builtin_msa_subsuu_s_h
+#define __msa_subsuu_s_w __builtin_msa_subsuu_s_w
+#define __msa_subsuu_s_d __builtin_msa_subsuu_s_d
+#define __msa_subsus_u_b __builtin_msa_subsus_u_b
+#define __msa_subsus_u_h __builtin_msa_subsus_u_h
+#define __msa_subsus_u_w __builtin_msa_subsus_u_w
+#define __msa_subsus_u_d __builtin_msa_subsus_u_d
+#define __msa_asub_s_b __builtin_msa_asub_s_b
+#define __msa_asub_s_h __builtin_msa_asub_s_h
+#define __msa_asub_s_w __builtin_msa_asub_s_w
+#define __msa_asub_s_d __builtin_msa_asub_s_d
+#define __msa_asub_u_b __builtin_msa_asub_u_b
+#define __msa_asub_u_h __builtin_msa_asub_u_h
+#define __msa_asub_u_w __builtin_msa_asub_u_w
+#define __msa_asub_u_d __builtin_msa_asub_u_d
+#define __msa_mulv_b __builtin_msa_mulv_b
+#define __msa_mulv_h __builtin_msa_mulv_h
+#define __msa_mulv_w __builtin_msa_mulv_w
+#define __msa_mulv_d __builtin_msa_mulv_d
+#define __msa_maddv_b __builtin_msa_maddv_b
+#define __msa_maddv_h __builtin_msa_maddv_h
+#define __msa_maddv_w __builtin_msa_maddv_w
+#define __msa_maddv_d __builtin_msa_maddv_d
+#define __msa_msubv_b __builtin_msa_msubv_b
+#define __msa_msubv_h __builtin_msa_msubv_h
+#define __msa_msubv_w __builtin_msa_msubv_w
+#define __msa_msubv_d __builtin_msa_msubv_d
+#define __msa_div_s_b __builtin_msa_div_s_b
+#define __msa_div_s_h __builtin_msa_div_s_h
+#define __msa_div_s_w __builtin_msa_div_s_w
+#define __msa_div_s_d __builtin_msa_div_s_d
+#define __msa_div_u_b __builtin_msa_div_u_b
+#define __msa_div_u_h __builtin_msa_div_u_h
+#define __msa_div_u_w __builtin_msa_div_u_w
+#define __msa_div_u_d __builtin_msa_div_u_d
+#define __msa_hadd_s_h __builtin_msa_hadd_s_h
+#define __msa_hadd_s_w __builtin_msa_hadd_s_w
+#define __msa_hadd_s_d __builtin_msa_hadd_s_d
+#define __msa_hadd_u_h __builtin_msa_hadd_u_h
+#define __msa_hadd_u_w __builtin_msa_hadd_u_w
+#define __msa_hadd_u_d __builtin_msa_hadd_u_d
+#define __msa_hsub_s_h __builtin_msa_hsub_s_h
+#define __msa_hsub_s_w __builtin_msa_hsub_s_w
+#define __msa_hsub_s_d __builtin_msa_hsub_s_d
+#define __msa_hsub_u_h __builtin_msa_hsub_u_h
+#define __msa_hsub_u_w __builtin_msa_hsub_u_w
+#define __msa_hsub_u_d __builtin_msa_hsub_u_d
+#define __msa_mod_s_b __builtin_msa_mod_s_b
+#define __msa_mod_s_h __builtin_msa_mod_s_h
+#define __msa_mod_s_w __builtin_msa_mod_s_w
+#define __msa_mod_s_d __builtin_msa_mod_s_d
+#define __msa_mod_u_b __builtin_msa_mod_u_b
+#define __msa_mod_u_h __builtin_msa_mod_u_h
+#define __msa_mod_u_w __builtin_msa_mod_u_w
+#define __msa_mod_u_d __builtin_msa_mod_u_d
+#define __msa_dotp_s_h __builtin_msa_dotp_s_h
+#define __msa_dotp_s_w __builtin_msa_dotp_s_w
+#define __msa_dotp_s_d __builtin_msa_dotp_s_d
+#define __msa_dotp_u_h __builtin_msa_dotp_u_h
+#define __msa_dotp_u_w __builtin_msa_dotp_u_w
+#define __msa_dotp_u_d __builtin_msa_dotp_u_d
+#define __msa_dpadd_s_h __builtin_msa_dpadd_s_h
+#define __msa_dpadd_s_w __builtin_msa_dpadd_s_w
+#define __msa_dpadd_s_d __builtin_msa_dpadd_s_d
+#define __msa_dpadd_u_h __builtin_msa_dpadd_u_h
+#define __msa_dpadd_u_w __builtin_msa_dpadd_u_w
+#define __msa_dpadd_u_d __builtin_msa_dpadd_u_d
+#define __msa_dpsub_s_h __builtin_msa_dpsub_s_h
+#define __msa_dpsub_s_w __builtin_msa_dpsub_s_w
+#define __msa_dpsub_s_d __builtin_msa_dpsub_s_d
+#define __msa_dpsub_u_h __builtin_msa_dpsub_u_h
+#define __msa_dpsub_u_w __builtin_msa_dpsub_u_w
+#define __msa_dpsub_u_d __builtin_msa_dpsub_u_d
+#define __msa_sld_b __builtin_msa_sld_b
+#define __msa_sld_h __builtin_msa_sld_h
+#define __msa_sld_w __builtin_msa_sld_w
+#define __msa_sld_d __builtin_msa_sld_d
+#define __msa_sldi_b __builtin_msa_sldi_b
+#define __msa_sldi_h __builtin_msa_sldi_h
+#define __msa_sldi_w __builtin_msa_sldi_w
+#define __msa_sldi_d __builtin_msa_sldi_d
+#define __msa_splat_b __builtin_msa_splat_b
+#define __msa_splat_h __builtin_msa_splat_h
+#define __msa_splat_w __builtin_msa_splat_w
+#define __msa_splat_d __builtin_msa_splat_d
+#define __msa_splati_b __builtin_msa_splati_b
+#define __msa_splati_h __builtin_msa_splati_h
+#define __msa_splati_w __builtin_msa_splati_w
+#define __msa_splati_d __builtin_msa_splati_d
+#define __msa_pckev_b __builtin_msa_pckev_b
+#define __msa_pckev_h __builtin_msa_pckev_h
+#define __msa_pckev_w __builtin_msa_pckev_w
+#define __msa_pckev_d __builtin_msa_pckev_d
+#define __msa_pckod_b __builtin_msa_pckod_b
+#define __msa_pckod_h __builtin_msa_pckod_h
+#define __msa_pckod_w __builtin_msa_pckod_w
+#define __msa_pckod_d __builtin_msa_pckod_d
+#define __msa_ilvl_b __builtin_msa_ilvl_b
+#define __msa_ilvl_h __builtin_msa_ilvl_h
+#define __msa_ilvl_w __builtin_msa_ilvl_w
+#define __msa_ilvl_d __builtin_msa_ilvl_d
+#define __msa_ilvr_b __builtin_msa_ilvr_b
+#define __msa_ilvr_h __builtin_msa_ilvr_h
+#define __msa_ilvr_w __builtin_msa_ilvr_w
+#define __msa_ilvr_d __builtin_msa_ilvr_d
+#define __msa_ilvev_b __builtin_msa_ilvev_b
+#define __msa_ilvev_h __builtin_msa_ilvev_h
+#define __msa_ilvev_w __builtin_msa_ilvev_w
+#define __msa_ilvev_d __builtin_msa_ilvev_d
+#define __msa_ilvod_b __builtin_msa_ilvod_b
+#define __msa_ilvod_h __builtin_msa_ilvod_h
+#define __msa_ilvod_w __builtin_msa_ilvod_w
+#define __msa_ilvod_d __builtin_msa_ilvod_d
+#define __msa_vshf_b __builtin_msa_vshf_b
+#define __msa_vshf_h __builtin_msa_vshf_h
+#define __msa_vshf_w __builtin_msa_vshf_w
+#define __msa_vshf_d __builtin_msa_vshf_d
+#define __msa_and_v __builtin_msa_and_v
+#define __msa_andi_b __builtin_msa_andi_b
+#define __msa_or_v __builtin_msa_or_v
+#define __msa_ori_b __builtin_msa_ori_b
+#define __msa_nor_v __builtin_msa_nor_v
+#define __msa_nori_b __builtin_msa_nori_b
+#define __msa_xor_v __builtin_msa_xor_v
+#define __msa_xori_b __builtin_msa_xori_b
+#define __msa_bmnz_v __builtin_msa_bmnz_v
+#define __msa_bmnzi_b __builtin_msa_bmnzi_b
+#define __msa_bmz_v __builtin_msa_bmz_v
+#define __msa_bmzi_b __builtin_msa_bmzi_b
+#define __msa_bsel_v __builtin_msa_bsel_v
+#define __msa_bseli_b __builtin_msa_bseli_b
+#define __msa_shf_b __builtin_msa_shf_b
+#define __msa_shf_h __builtin_msa_shf_h
+#define __msa_shf_w __builtin_msa_shf_w
+#define __msa_test_bnz_v __builtin_msa_bnz_v
+#define __msa_test_bz_v __builtin_msa_bz_v
+#define __msa_fill_b __builtin_msa_fill_b
+#define __msa_fill_h __builtin_msa_fill_h
+#define __msa_fill_w __builtin_msa_fill_w
+#define __msa_fill_d __builtin_msa_fill_d
+#define __msa_pcnt_b __builtin_msa_pcnt_b
+#define __msa_pcnt_h __builtin_msa_pcnt_h
+#define __msa_pcnt_w __builtin_msa_pcnt_w
+#define __msa_pcnt_d __builtin_msa_pcnt_d
+#define __msa_nloc_b __builtin_msa_nloc_b
+#define __msa_nloc_h __builtin_msa_nloc_h
+#define __msa_nloc_w __builtin_msa_nloc_w
+#define __msa_nloc_d __builtin_msa_nloc_d
+#define __msa_nlzc_b __builtin_msa_nlzc_b
+#define __msa_nlzc_h __builtin_msa_nlzc_h
+#define __msa_nlzc_w __builtin_msa_nlzc_w
+#define __msa_nlzc_d __builtin_msa_nlzc_d
+#define __msa_copy_s_b __builtin_msa_copy_s_b
+#define __msa_copy_s_h __builtin_msa_copy_s_h
+#define __msa_copy_s_w __builtin_msa_copy_s_w
+#define __msa_copy_s_d __builtin_msa_copy_s_d
+#define __msa_copy_u_b __builtin_msa_copy_u_b
+#define __msa_copy_u_h __builtin_msa_copy_u_h
+#define __msa_copy_u_w __builtin_msa_copy_u_w
+#define __msa_copy_u_d __builtin_msa_copy_u_d
+#define __msa_insert_b __builtin_msa_insert_b
+#define __msa_insert_h __builtin_msa_insert_h
+#define __msa_insert_w __builtin_msa_insert_w
+#define __msa_insert_d __builtin_msa_insert_d
+#define __msa_insve_b __builtin_msa_insve_b
+#define __msa_insve_h __builtin_msa_insve_h
+#define __msa_insve_w __builtin_msa_insve_w
+#define __msa_insve_d __builtin_msa_insve_d
+#define __msa_test_bnz_b __builtin_msa_bnz_b
+#define __msa_test_bnz_h __builtin_msa_bnz_h
+#define __msa_test_bnz_w __builtin_msa_bnz_w
+#define __msa_test_bnz_d __builtin_msa_bnz_d
+#define __msa_test_bz_b __builtin_msa_bz_b
+#define __msa_test_bz_h __builtin_msa_bz_h
+#define __msa_test_bz_w __builtin_msa_bz_w
+#define __msa_test_bz_d __builtin_msa_bz_d
+#define __msa_ldi_b __builtin_msa_ldi_b
+#define __msa_ldi_h __builtin_msa_ldi_h
+#define __msa_ldi_w __builtin_msa_ldi_w
+#define __msa_ldi_d __builtin_msa_ldi_d
+#define __msa_fcaf_w __builtin_msa_fcaf_w
+#define __msa_fcaf_d __builtin_msa_fcaf_d
+#define __msa_fcor_w __builtin_msa_fcor_w
+#define __msa_fcor_d __builtin_msa_fcor_d
+#define __msa_fcun_w __builtin_msa_fcun_w
+#define __msa_fcun_d __builtin_msa_fcun_d
+#define __msa_fcune_w __builtin_msa_fcune_w
+#define __msa_fcune_d __builtin_msa_fcune_d
+#define __msa_fcueq_w __builtin_msa_fcueq_w
+#define __msa_fcueq_d __builtin_msa_fcueq_d
+#define __msa_fceq_w __builtin_msa_fceq_w
+#define __msa_fceq_d __builtin_msa_fceq_d
+#define __msa_fcne_w __builtin_msa_fcne_w
+#define __msa_fcne_d __builtin_msa_fcne_d
+#define __msa_fclt_w __builtin_msa_fclt_w
+#define __msa_fclt_d __builtin_msa_fclt_d
+#define __msa_fcult_w __builtin_msa_fcult_w
+#define __msa_fcult_d __builtin_msa_fcult_d
+#define __msa_fcle_w __builtin_msa_fcle_w
+#define __msa_fcle_d __builtin_msa_fcle_d
+#define __msa_fcule_w __builtin_msa_fcule_w
+#define __msa_fcule_d __builtin_msa_fcule_d
+#define __msa_fsaf_w __builtin_msa_fsaf_w
+#define __msa_fsaf_d __builtin_msa_fsaf_d
+#define __msa_fsor_w __builtin_msa_fsor_w
+#define __msa_fsor_d __builtin_msa_fsor_d
+#define __msa_fsun_w __builtin_msa_fsun_w
+#define __msa_fsun_d __builtin_msa_fsun_d
+#define __msa_fsune_w __builtin_msa_fsune_w
+#define __msa_fsune_d __builtin_msa_fsune_d
+#define __msa_fsueq_w __builtin_msa_fsueq_w
+#define __msa_fsueq_d __builtin_msa_fsueq_d
+#define __msa_fseq_w __builtin_msa_fseq_w
+#define __msa_fseq_d __builtin_msa_fseq_d
+#define __msa_fsne_w __builtin_msa_fsne_w
+#define __msa_fsne_d __builtin_msa_fsne_d
+#define __msa_fslt_w __builtin_msa_fslt_w
+#define __msa_fslt_d __builtin_msa_fslt_d
+#define __msa_fsult_w __builtin_msa_fsult_w
+#define __msa_fsult_d __builtin_msa_fsult_d
+#define __msa_fsle_w __builtin_msa_fsle_w
+#define __msa_fsle_d __builtin_msa_fsle_d
+#define __msa_fsule_w __builtin_msa_fsule_w
+#define __msa_fsule_d __builtin_msa_fsule_d
+#define __msa_fadd_w __builtin_msa_fadd_w
+#define __msa_fadd_d __builtin_msa_fadd_d
+#define __msa_fsub_w __builtin_msa_fsub_w
+#define __msa_fsub_d __builtin_msa_fsub_d
+#define __msa_fmul_w __builtin_msa_fmul_w
+#define __msa_fmul_d __builtin_msa_fmul_d
+#define __msa_fdiv_w __builtin_msa_fdiv_w
+#define __msa_fdiv_d __builtin_msa_fdiv_d
+#define __msa_fmadd_w __builtin_msa_fmadd_w
+#define __msa_fmadd_d __builtin_msa_fmadd_d
+#define __msa_fmsub_w __builtin_msa_fmsub_w
+#define __msa_fmsub_d __builtin_msa_fmsub_d
+#define __msa_fexp2_w __builtin_msa_fexp2_w
+#define __msa_fexp2_d __builtin_msa_fexp2_d
+#define __msa_fexdo_h __builtin_msa_fexdo_h
+#define __msa_fexdo_w __builtin_msa_fexdo_w
+#define __msa_ftq_h __builtin_msa_ftq_h
+#define __msa_ftq_w __builtin_msa_ftq_w
+#define __msa_fmin_w __builtin_msa_fmin_w
+#define __msa_fmin_d __builtin_msa_fmin_d
+#define __msa_fmin_a_w __builtin_msa_fmin_a_w
+#define __msa_fmin_a_d __builtin_msa_fmin_a_d
+#define __msa_fmax_w __builtin_msa_fmax_w
+#define __msa_fmax_d __builtin_msa_fmax_d
+#define __msa_fmax_a_w __builtin_msa_fmax_a_w
+#define __msa_fmax_a_d __builtin_msa_fmax_a_d
+#define __msa_mul_q_h __builtin_msa_mul_q_h
+#define __msa_mul_q_w __builtin_msa_mul_q_w
+#define __msa_mulr_q_h __builtin_msa_mulr_q_h
+#define __msa_mulr_q_w __builtin_msa_mulr_q_w
+#define __msa_madd_q_h __builtin_msa_madd_q_h
+#define __msa_madd_q_w __builtin_msa_madd_q_w
+#define __msa_maddr_q_h __builtin_msa_maddr_q_h
+#define __msa_maddr_q_w __builtin_msa_maddr_q_w
+#define __msa_msub_q_h __builtin_msa_msub_q_h
+#define __msa_msub_q_w __builtin_msa_msub_q_w
+#define __msa_msubr_q_h __builtin_msa_msubr_q_h
+#define __msa_msubr_q_w __builtin_msa_msubr_q_w
+#define __msa_fclass_w __builtin_msa_fclass_w
+#define __msa_fclass_d __builtin_msa_fclass_d
+#define __msa_fsqrt_w __builtin_msa_fsqrt_w
+#define __msa_fsqrt_d __builtin_msa_fsqrt_d
+#define __msa_frcp_w __builtin_msa_frcp_w
+#define __msa_frcp_d __builtin_msa_frcp_d
+#define __msa_frint_w __builtin_msa_frint_w
+#define __msa_frint_d __builtin_msa_frint_d
+#define __msa_frsqrt_w __builtin_msa_frsqrt_w
+#define __msa_frsqrt_d __builtin_msa_frsqrt_d
+#define __msa_flog2_w __builtin_msa_flog2_w
+#define __msa_flog2_d __builtin_msa_flog2_d
+#define __msa_fexupl_w __builtin_msa_fexupl_w
+#define __msa_fexupl_d __builtin_msa_fexupl_d
+#define __msa_fexupr_w __builtin_msa_fexupr_w
+#define __msa_fexupr_d __builtin_msa_fexupr_d
+#define __msa_ffql_w __builtin_msa_ffql_w
+#define __msa_ffql_d __builtin_msa_ffql_d
+#define __msa_ffqr_w __builtin_msa_ffqr_w
+#define __msa_ffqr_d __builtin_msa_ffqr_d
+#define __msa_ftint_s_w __builtin_msa_ftint_s_w
+#define __msa_ftint_s_d __builtin_msa_ftint_s_d
+#define __msa_ftint_u_w __builtin_msa_ftint_u_w
+#define __msa_ftint_u_d __builtin_msa_ftint_u_d
+#define __msa_ftrunc_s_w __builtin_msa_ftrunc_s_w
+#define __msa_ftrunc_s_d __builtin_msa_ftrunc_s_d
+#define __msa_ftrunc_u_w __builtin_msa_ftrunc_u_w
+#define __msa_ftrunc_u_d __builtin_msa_ftrunc_u_d
+#define __msa_ffint_s_w __builtin_msa_ffint_s_w
+#define __msa_ffint_s_d __builtin_msa_ffint_s_d
+#define __msa_ffint_u_w __builtin_msa_ffint_u_w
+#define __msa_ffint_u_d __builtin_msa_ffint_u_d
+#define __msa_cfcmsa __builtin_msa_cfcmsa
+#define __msa_move_v __builtin_msa_move_v
+#define __msa_cast_to_vector_float __builtin_msa_cast_to_vector_float
+#define __msa_cast_to_vector_double __builtin_msa_cast_to_vector_double
+#define __msa_cast_to_scalar_float __builtin_msa_cast_to_scalar_float
+#define __msa_cast_to_scalar_double __builtin_msa_cast_to_scalar_double
+#endif /* defined(__mips_msa) */
+#endif /* _MSA_H */
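
msa.h itself only declares the 128-bit vector types and forwards each __msa_* name to its compiler builtin, so the intrinsics compose like ordinary C expressions. A minimal sketch, assuming a MIPS target compiled with -mmsa:

#include <msa.h>

/* Per-lane (2*x + 2) on a [4 x i32] vector. */
v4i32 scale_add(v4i32 x) {
  v4i32 two = __msa_ldi_w(2);     /* splat the immediate 2 into every lane */
  v4i32 dbl = __msa_slli_w(x, 1); /* logical shift left: x << 1 per lane */
  return __msa_addv_w(dbl, two);  /* element-wise add */
}
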
diff --git a/darwin-x86/clang-headers/mwaitxintrin.h b/darwin-x86/clang-headers/mwaitxintrin.h
index 635f2ac..2921ead 100644
--- a/darwin-x86/clang-headers/mwaitxintrin.h
+++ b/darwin-x86/clang-headers/mwaitxintrin.h
@@ -25,8 +25,8 @@
 #error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
 #endif
 
-#ifndef _MWAITXINTRIN_H
-#define _MWAITXINTRIN_H
+#ifndef __MWAITXINTRIN_H
+#define __MWAITXINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
@@ -44,4 +44,4 @@
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* _MWAITXINTRIN_H */
+#endif /* __MWAITXINTRIN_H */
diff --git a/darwin-x86/clang-headers/nmmintrin.h b/darwin-x86/clang-headers/nmmintrin.h
index 57fec15..348fb8c 100644
--- a/darwin-x86/clang-headers/nmmintrin.h
+++ b/darwin-x86/clang-headers/nmmintrin.h
@@ -21,10 +21,10 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef _NMMINTRIN_H
-#define _NMMINTRIN_H
+#ifndef __NMMINTRIN_H
+#define __NMMINTRIN_H
 
 /* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
    just include it now then.  */
 #include <smmintrin.h>
-#endif /* _NMMINTRIN_H */
+#endif /* __NMMINTRIN_H */
diff --git a/darwin-x86/clang-headers/omp.h b/darwin-x86/clang-headers/omp.h
new file mode 100644
index 0000000..c31a07e
--- /dev/null
+++ b/darwin-x86/clang-headers/omp.h
@@ -0,0 +1,215 @@
+/*
+ * include/50/omp.h.var
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef __OMP_H
+#   define __OMP_H
+
+#   define KMP_VERSION_MAJOR    5
+#   define KMP_VERSION_MINOR    0
+#   define KMP_VERSION_BUILD    20140926
+#   define KMP_BUILD_DATE       "No_Timestamp"
+
+#   ifdef __cplusplus
+    extern "C" {
+#   endif
+
+#   if defined(_WIN32)
+#       define __KAI_KMPC_CONVENTION __cdecl
+#   else
+#       define __KAI_KMPC_CONVENTION
+#   endif
+
+    /* schedule kind constants */
+    typedef enum omp_sched_t {
+	omp_sched_static  = 1,
+	omp_sched_dynamic = 2,
+	omp_sched_guided  = 3,
+	omp_sched_auto    = 4
+    } omp_sched_t;
+
+    /* set API functions */
+    extern void   __KAI_KMPC_CONVENTION  omp_set_num_threads (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_dynamic     (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nested      (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_max_active_levels (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_schedule          (omp_sched_t, int);
+
+    /* query API functions */
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_dynamic      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_nested       (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_num   (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_procs    (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_parallel      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_final         (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_active_level        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_level               (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_ancestor_thread_num (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_team_size           (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_limit        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_active_levels   (void);
+    extern void   __KAI_KMPC_CONVENTION  omp_get_schedule            (omp_sched_t *, int *);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_task_priority   (void);
+
+    /* lock API functions */
+    typedef struct omp_lock_t {
+        void * _lk;
+    } omp_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_lock    (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_lock     (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_lock   (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_lock (omp_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_lock    (omp_lock_t *);
+
+    /* nested lock API functions */
+    typedef struct omp_nest_lock_t {
+        void * _lk;
+    } omp_nest_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_nest_lock    (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nest_lock     (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_nest_lock   (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_nest_lock (omp_nest_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_nest_lock    (omp_nest_lock_t *);
+
+    /* lock hint type for dynamic user lock */
+    typedef enum omp_lock_hint_t {
+        omp_lock_hint_none           = 0,
+        omp_lock_hint_uncontended    = 1,
+        omp_lock_hint_contended      = (1<<1 ),
+        omp_lock_hint_nonspeculative = (1<<2 ),
+        omp_lock_hint_speculative    = (1<<3 ),
+        kmp_lock_hint_hle            = (1<<16),
+        kmp_lock_hint_rtm            = (1<<17),
+        kmp_lock_hint_adaptive       = (1<<18)
+    } omp_lock_hint_t;
+
+    /* hinted lock initializers */
+    extern void __KAI_KMPC_CONVENTION omp_init_lock_with_hint(omp_lock_t *, omp_lock_hint_t);
+    extern void __KAI_KMPC_CONVENTION omp_init_nest_lock_with_hint(omp_nest_lock_t *, omp_lock_hint_t);
+
+    /* time API functions */
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtime (void);
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtick (void);
+
+    /* OpenMP 4.0 */
+    extern int  __KAI_KMPC_CONVENTION  omp_get_default_device (void);
+    extern void __KAI_KMPC_CONVENTION  omp_set_default_device (int);
+    extern int  __KAI_KMPC_CONVENTION  omp_is_initial_device (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_num_devices (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_num_teams (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_team_num (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_cancellation (void);
+
+#   include <stdlib.h>
+    /* OpenMP 4.5 */
+    extern int   __KAI_KMPC_CONVENTION  omp_get_initial_device (void);
+    extern void* __KAI_KMPC_CONVENTION  omp_target_alloc(size_t, int);
+    extern void  __KAI_KMPC_CONVENTION  omp_target_free(void *, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_is_present(void *, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_memcpy(void *, void *, size_t, size_t, size_t, int, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_memcpy_rect(void *, void *, size_t, int, const size_t *,
+                                            const size_t *, const size_t *, const size_t *, const size_t *, int, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_associate_ptr(void *, void *, size_t, size_t, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_disassociate_ptr(void *, int);
+
+    /* kmp API functions */
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_stacksize          (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize          (int);
+    extern size_t __KAI_KMPC_CONVENTION  kmp_get_stacksize_s        (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize_s        (size_t);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_blocktime          (void);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_library            (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_blocktime          (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library            (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_serial     (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_turnaround (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_throughput (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_defaults           (char const *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_disp_num_buffers   (int);
+
+    /* Intel affinity API */
+    typedef void * kmp_affinity_mask_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_max_proc    (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_create_affinity_mask     (kmp_affinity_mask_t *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_destroy_affinity_mask    (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_unset_affinity_mask_proc (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+
+    /* OpenMP 4.0 affinity API */
+    typedef enum omp_proc_bind_t {
+        omp_proc_bind_false = 0,
+        omp_proc_bind_true = 1,
+        omp_proc_bind_master = 2,
+        omp_proc_bind_close = 3,
+        omp_proc_bind_spread = 4
+    } omp_proc_bind_t;
+
+    extern omp_proc_bind_t __KAI_KMPC_CONVENTION omp_get_proc_bind (void);
+
+    /* OpenMP 4.5 affinity API */
+    extern int  __KAI_KMPC_CONVENTION omp_get_num_places (void);
+    extern int  __KAI_KMPC_CONVENTION omp_get_place_num_procs (int);
+    extern void __KAI_KMPC_CONVENTION omp_get_place_proc_ids (int, int *);
+    extern int  __KAI_KMPC_CONVENTION omp_get_place_num (void);
+    extern int  __KAI_KMPC_CONVENTION omp_get_partition_num_places (void);
+    extern void __KAI_KMPC_CONVENTION omp_get_partition_place_nums (int *);
+
+    extern void * __KAI_KMPC_CONVENTION  kmp_malloc  (size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_aligned_malloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_calloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_realloc (void *, size_t);
+    extern void   __KAI_KMPC_CONVENTION  kmp_free    (void *);
+
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_on(void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_off(void);
+
+    /* OpenMP 5.0 Tool Control */
+    typedef enum omp_control_tool_result_t {
+        omp_control_tool_notool = -2,
+        omp_control_tool_nocallback = -1,
+        omp_control_tool_success = 0,
+        omp_control_tool_ignored = 1
+    } omp_control_tool_result_t;
+
+    typedef enum omp_control_tool_t {
+        omp_control_tool_start = 1,
+        omp_control_tool_pause = 2,
+        omp_control_tool_flush = 3,
+        omp_control_tool_end = 4
+    } omp_control_tool_t;
+    
+    extern int __KAI_KMPC_CONVENTION omp_control_tool(int, int, void*);
+
+#   undef __KAI_KMPC_CONVENTION
+
+    /* Warning:
+       The following typedefs are not standard; they are deprecated and will
+       be removed in a future release.
+    */
+    typedef int     omp_int_t;
+    typedef double  omp_wtime_t;
+
+#   ifdef __cplusplus
+    }
+#   endif
+
+#endif /* __OMP_H */
+
diff --git a/darwin-x86/clang-headers/ompt.h b/darwin-x86/clang-headers/ompt.h
new file mode 100644
index 0000000..21b4c46
--- /dev/null
+++ b/darwin-x86/clang-headers/ompt.h
@@ -0,0 +1,697 @@
+/*
+ * include/50/ompt.h.var
+ */
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPT__
+#define __OMPT__
+
+/*****************************************************************************
+ * system include files
+ *****************************************************************************/
+
+#include <stdint.h>
+#include <stddef.h>
+
+
+
+/*****************************************************************************
+ * iteration macros
+ *****************************************************************************/
+
+#define FOREACH_OMPT_INQUIRY_FN(macro)      \
+    macro (ompt_enumerate_states)           \
+    macro (ompt_enumerate_mutex_impls)      \
+                                            \
+    macro (ompt_set_callback)               \
+    macro (ompt_get_callback)               \
+                                            \
+    macro (ompt_get_state)                  \
+                                            \
+    macro (ompt_get_parallel_info)          \
+    macro (ompt_get_task_info)              \
+    macro (ompt_get_thread_data)            \
+    macro (ompt_get_unique_id)              \
+                                            \
+    macro (ompt_get_num_procs)              \
+    macro (ompt_get_num_places)             \
+    macro (ompt_get_place_proc_ids)         \
+    macro (ompt_get_place_num)              \
+    macro (ompt_get_partition_place_nums)   \
+    macro (ompt_get_proc_id)                \
+                                            \
+    macro (ompt_get_target_info)            \
+    macro (ompt_get_num_devices)
+
+#define FOREACH_OMP_STATE(macro)                                                                \
+                                                                                                \
+    /* first available state */                                                                 \
+    macro (omp_state_undefined, 0x102)      /* undefined thread state */                        \
+                                                                                                \
+    /* work states (0..15) */                                                                   \
+    macro (omp_state_work_serial, 0x000)    /* working outside parallel */                      \
+    macro (omp_state_work_parallel, 0x001)  /* working within parallel */                       \
+    macro (omp_state_work_reduction, 0x002) /* performing a reduction */                        \
+                                                                                                \
+    /* barrier wait states (16..31) */                                                          \
+    macro (omp_state_wait_barrier, 0x010)   /* waiting at a barrier */                          \
+    macro (omp_state_wait_barrier_implicit_parallel, 0x011)                                     \
+                                            /* implicit barrier at the end of parallel region */\
+    macro (omp_state_wait_barrier_implicit_workshare, 0x012)                                    \
+                                            /* implicit barrier at the end of worksharing */    \
+    macro (omp_state_wait_barrier_implicit, 0x013)  /* implicit barrier */                      \
+    macro (omp_state_wait_barrier_explicit, 0x014)  /* explicit barrier */                      \
+                                                                                                \
+    /* task wait states (32..63) */                                                             \
+    macro (omp_state_wait_taskwait, 0x020)  /* waiting at a taskwait */                         \
+    macro (omp_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */                        \
+                                                                                                \
+    /* mutex wait states (64..127) */                                                           \
+    macro (omp_state_wait_mutex, 0x040)                                                         \
+    macro (omp_state_wait_lock, 0x041)      /* waiting for lock */                              \
+    macro (omp_state_wait_critical, 0x042)  /* waiting for critical */                          \
+    macro (omp_state_wait_atomic, 0x043)    /* waiting for atomic */                            \
+    macro (omp_state_wait_ordered, 0x044)   /* waiting for ordered */                           \
+                                                                                                \
+    /* target wait states (128..255) */                                                         \
+    macro (omp_state_wait_target, 0x080)        /* waiting for target region */                 \
+    macro (omp_state_wait_target_map, 0x081)    /* waiting for target data mapping operation */ \
+    macro (omp_state_wait_target_update, 0x082) /* waiting for target update operation */       \
+                                                                                                \
+    /* misc (256..511) */                                                                       \
+    macro (omp_state_idle, 0x100)           /* waiting for work */                              \
+    macro (omp_state_overhead, 0x101)       /* overhead excluding wait states */                \
+                                                                                                \
+    /* implementation-specific states (512..) */
+
+
+#define FOREACH_KMP_MUTEX_IMPL(macro)                                                \
+    macro (ompt_mutex_impl_unknown, 0)     /* unknown implementation */              \
+    macro (kmp_mutex_impl_spin, 1)         /* based on spin */                       \
+    macro (kmp_mutex_impl_queuing, 2)      /* based on some fair policy */           \
+    macro (kmp_mutex_impl_speculative, 3)  /* based on HW-supported speculation */
+
+#define FOREACH_OMPT_EVENT(macro)                                                                                        \
+                                                                                                                         \
+    /*--- Mandatory Events ---*/                                                                                         \
+    macro (ompt_callback_thread_begin,      ompt_callback_thread_begin_t,       1) /* thread begin                    */ \
+    macro (ompt_callback_thread_end,        ompt_callback_thread_end_t,         2) /* thread end                      */ \
+                                                                                                                         \
+    macro (ompt_callback_parallel_begin,    ompt_callback_parallel_begin_t,     3) /* parallel begin                  */ \
+    macro (ompt_callback_parallel_end,      ompt_callback_parallel_end_t,       4) /* parallel end                    */ \
+                                                                                                                         \
+    macro (ompt_callback_task_create,       ompt_callback_task_create_t,        5) /* task begin                      */ \
+    macro (ompt_callback_task_schedule,     ompt_callback_task_schedule_t,      6) /* task schedule                   */ \
+    macro (ompt_callback_implicit_task,     ompt_callback_implicit_task_t,      7) /* implicit task                   */ \
+                                                                                                                         \
+    macro (ompt_callback_target,            ompt_callback_target_t,             8) /* target                          */ \
+    macro (ompt_callback_target_data_op,    ompt_callback_target_data_op_t,     9) /* target data op                  */ \
+    macro (ompt_callback_target_submit,     ompt_callback_target_submit_t,     10) /* target submit                   */ \
+                                                                                                                         \
+    macro (ompt_callback_control_tool,      ompt_callback_control_tool_t,      11) /* control tool                    */ \
+                                                                                                                         \
+    macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize               */ \
+    macro (ompt_callback_device_finalize,   ompt_callback_device_finalize_t,   13) /* device finalize                 */ \
+                                                                                                                         \
+    macro (ompt_callback_device_load,       ompt_callback_device_load_t,       14) /* device load                     */ \
+    macro (ompt_callback_device_unload,     ompt_callback_device_unload_t,     15) /* device unload                   */ \
+                                                                                                                         \
+    /* Optional Events */                                                                                                \
+    macro (ompt_callback_sync_region_wait,  ompt_callback_sync_region_t,       16) /* sync region wait begin or end   */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_released,    ompt_callback_mutex_t,             17) /* mutex released                  */ \
+                                                                                                                         \
+    macro (ompt_callback_task_dependences,  ompt_callback_task_dependences_t,  18) /* report task dependences         */ \
+    macro (ompt_callback_task_dependence,   ompt_callback_task_dependence_t,   19) /* report task dependence          */ \
+                                                                                                                         \
+    macro (ompt_callback_work,              ompt_callback_work_t,              20) /* task at work begin or end       */ \
+                                                                                                                         \
+    macro (ompt_callback_master,            ompt_callback_master_t,            21) /* task at master begin or end     */ \
+                                                                                                                         \
+    macro (ompt_callback_target_map,        ompt_callback_target_map_t,        22) /* target map                      */ \
+                                                                                                                         \
+    macro (ompt_callback_sync_region,       ompt_callback_sync_region_t,       23) /* sync region begin or end        */ \
+                                                                                                                         \
+    macro (ompt_callback_lock_init,         ompt_callback_mutex_acquire_t,     24) /* lock init                       */ \
+    macro (ompt_callback_lock_destroy,      ompt_callback_mutex_t,             25) /* lock destroy                    */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_acquire,     ompt_callback_mutex_acquire_t,     26) /* mutex acquire                   */ \
+    macro (ompt_callback_mutex_acquired,    ompt_callback_mutex_t,             27) /* mutex acquired                  */ \
+                                                                                                                         \
+    macro (ompt_callback_nest_lock,         ompt_callback_nest_lock_t,         28) /* nest lock                       */ \
+                                                                                                                         \
+    macro (ompt_callback_flush,             ompt_callback_flush_t,             29) /* after executing flush           */ \
+                                                                                                                         \
+    macro (ompt_callback_cancel,            ompt_callback_cancel_t,            30) /* cancel innermost binding region */ \
+    macro (ompt_callback_idle,              ompt_callback_idle_t,              31) /* begin or end idle state         */
+
+
+
+/*****************************************************************************
+ * data types
+ *****************************************************************************/
+
+/*---------------------
+ * identifiers
+ *---------------------*/
+
+typedef uint64_t ompt_id_t;
+#define ompt_id_none 0
+
+typedef union ompt_data_t {
+  uint64_t value; /* data initialized by runtime to unique id */
+  void *ptr;      /* pointer under tool control */
+} ompt_data_t;
+
+static const ompt_data_t ompt_data_none = {0};
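+
+/* Illustrative note: a tool either lets the runtime publish a unique id in
+   'value' or keeps its own state in 'ptr', e.g. (my_record is hypothetical):
+
+       task_data->ptr = my_record;
+*/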
+
+typedef uint64_t omp_wait_id_t;
+static const omp_wait_id_t omp_wait_id_none = 0;
+
+typedef void ompt_device_t;
+
+/*---------------------
+ * omp_frame_t
+ *---------------------*/
+
+typedef struct omp_frame_t {
+    void *exit_frame;    /* next frame is user code     */
+    void *enter_frame;   /* previous frame is user code */
+} omp_frame_t;
+
+
+/*---------------------
+ * dependences types
+ *---------------------*/
+
+typedef enum ompt_task_dependence_flag_t {
+    // a two-bit field for the dependence type
+    ompt_task_dependence_type_out   = 1,
+    ompt_task_dependence_type_in    = 2,
+    ompt_task_dependence_type_inout = 3,
+} ompt_task_dependence_flag_t;
+
+typedef struct ompt_task_dependence_t {
+    void *variable_addr;
+    unsigned int dependence_flags;
+} ompt_task_dependence_t;
+
+
+/*****************************************************************************
+ * enumerations for thread states and runtime events
+ *****************************************************************************/
+
+/*---------------------
+ * runtime states
+ *---------------------*/
+
+typedef enum {
+#define omp_state_macro(state, code) state = code,
+    FOREACH_OMP_STATE(omp_state_macro)
+#undef omp_state_macro
+} omp_state_t;
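+
+/* The X-macro above expands FOREACH_OMP_STATE into plain enumerators, e.g.
+
+       omp_state_undefined = 0x102,
+       omp_state_work_serial = 0x000,
+       ...
+
+   so a single list defines both the enumerator names and their codes. */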
+
+
+/*---------------------
+ * runtime events
+ *---------------------*/
+
+typedef enum ompt_callbacks_e {
+#define ompt_event_macro(event, callback, eventid) event = eventid,
+    FOREACH_OMPT_EVENT(ompt_event_macro)
+#undef ompt_event_macro
+} ompt_callbacks_t;
+
+
+/*---------------------
+ * set callback results
+ *---------------------*/
+typedef enum ompt_set_result_t {
+    ompt_set_error = 0,
+    ompt_set_never = 1,
+    ompt_set_sometimes = 2,
+    ompt_set_sometimes_paired = 3,
+    ompt_set_always = 4
+} ompt_set_result_t;
+
+
+/*----------------------
+ * mutex implementations
+ *----------------------*/
+typedef enum kmp_mutex_impl_t {
+#define kmp_mutex_impl_macro(impl, code) impl = code,
+    FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
+#undef kmp_mutex_impl_macro
+} kmp_mutex_impl_t;
+
+
+/*****************************************************************************
+ * callback signatures
+ *****************************************************************************/
+
+/* initialization */
+typedef void (*ompt_interface_fn_t)(void);
+
+typedef ompt_interface_fn_t (*ompt_function_lookup_t)(
+    const char *                          /* entry point to look up              */
+);
+
+/* threads */
+typedef enum ompt_thread_type_t {
+    ompt_thread_initial = 1, // start the enumeration at 1
+    ompt_thread_worker  = 2,
+    ompt_thread_other   = 3,
+    ompt_thread_unknown = 4
+} ompt_thread_type_t;
+
+typedef enum ompt_invoker_t {
+    ompt_invoker_program = 1,             /* program invokes master task         */
+    ompt_invoker_runtime = 2              /* runtime invokes master task         */
+} ompt_invoker_t;
+
+typedef void (*ompt_callback_thread_begin_t) (
+    ompt_thread_type_t thread_type,       /* type of thread                      */
+    ompt_data_t *thread_data              /* data of thread                      */
+);
+
+typedef void (*ompt_callback_thread_end_t) (
+    ompt_data_t *thread_data              /* data of thread                      */
+);
+
+typedef void (*ompt_wait_callback_t) (
+    omp_wait_id_t wait_id                 /* wait data                          */
+);
+
+/* parallel and workshares */
+typedef enum ompt_scope_endpoint_t {
+    ompt_scope_begin = 1,
+    ompt_scope_end   = 2
+} ompt_scope_endpoint_t;
+
+
+/* implicit task */
+typedef void (*ompt_callback_implicit_task_t) (
+    ompt_scope_endpoint_t endpoint,       /* endpoint of implicit task           */
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *task_data,               /* data of implicit task               */
+    unsigned int team_size,               /* team size                           */
+    unsigned int thread_num               /* thread number of calling thread     */
+);
+
+typedef void (*ompt_callback_parallel_begin_t) (
+    ompt_data_t *encountering_task_data,         /* data of encountering task           */
+    const omp_frame_t *encountering_task_frame,  /* frame data of encountering task     */
+    ompt_data_t *parallel_data,                  /* data of parallel region             */
+    unsigned int requested_team_size,            /* requested number of threads in team */
+    ompt_invoker_t invoker,                      /* invoker of master task              */
+    const void *codeptr_ra                       /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_parallel_end_t) (
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *encountering_task_data,  /* data of encountering task           */
+    ompt_invoker_t invoker,               /* invoker of master task              */ 
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+/* tasks */
+typedef enum ompt_task_type_t {
+    ompt_task_initial    = 0x1,
+    ompt_task_implicit   = 0x2,
+    ompt_task_explicit   = 0x4,
+    ompt_task_target     = 0x8,
+    ompt_task_undeferred = 0x8000000,
+    ompt_task_untied     = 0x10000000,
+    ompt_task_final      = 0x20000000,
+    ompt_task_mergeable  = 0x40000000,
+    ompt_task_merged     = 0x80000000
+} ompt_task_type_t;
+
+typedef enum ompt_task_status_t {
+    ompt_task_complete = 1,
+    ompt_task_yield    = 2,
+    ompt_task_cancel   = 3,
+    ompt_task_others   = 4
+} ompt_task_status_t;
+
+typedef void (*ompt_callback_task_schedule_t) (
+    ompt_data_t *prior_task_data,         /* data of prior task                  */
+    ompt_task_status_t prior_task_status, /* status of prior task                */
+    ompt_data_t *next_task_data           /* data of next task                   */
+);
+
+typedef void (*ompt_callback_task_create_t) (
+    ompt_data_t *encountering_task_data,         /* data of parent task                 */
+    const omp_frame_t *encountering_task_frame,  /* frame data for parent task          */
+    ompt_data_t *new_task_data,                  /* data of created task                */
+    int type,                                    /* type of created task                */
+    int has_dependences,                         /* created task has dependences        */
+    const void *codeptr_ra                       /* return address of runtime call      */
+);
+
+/* task dependences */
+typedef void (*ompt_callback_task_dependences_t) (
+    ompt_data_t *task_data,               /* data of task                        */
+    const ompt_task_dependence_t *deps,   /* dependences of task                 */
+    int ndeps                             /* dependences count of task           */
+);
+
+typedef void (*ompt_callback_task_dependence_t) (
+    ompt_data_t *src_task_data,           /* data of source task                 */
+    ompt_data_t *sink_task_data           /* data of sink task                   */
+);
+
+/* target and device */
+typedef enum ompt_target_type_t {
+    ompt_target = 1,
+    ompt_target_enter_data = 2,
+    ompt_target_exit_data = 3,
+    ompt_target_update = 4
+} ompt_target_type_t;
+
+typedef void (*ompt_callback_target_t) (
+    ompt_target_type_t kind,
+    ompt_scope_endpoint_t endpoint,
+    uint64_t device_num,
+    ompt_data_t *task_data,
+    ompt_id_t target_id,
+    const void *codeptr_ra
+);
+
+typedef enum ompt_target_data_op_t {
+    ompt_target_data_alloc = 1,
+    ompt_target_data_transfer_to_dev = 2,
+    ompt_target_data_transfer_from_dev = 3,
+    ompt_target_data_delete = 4
+} ompt_target_data_op_t;
+
+typedef void (*ompt_callback_target_data_op_t) (
+    ompt_id_t target_id,
+    ompt_id_t host_op_id,
+    ompt_target_data_op_t optype,
+    void *host_addr,
+    void *device_addr,
+    size_t bytes
+);
+
+typedef void (*ompt_callback_target_submit_t) (
+    ompt_id_t target_id,
+    ompt_id_t host_op_id
+);
+
+typedef void (*ompt_callback_target_map_t) (
+    ompt_id_t target_id,
+    unsigned int nitems,
+    void **host_addr,
+    void **device_addr,
+    size_t *bytes,
+    unsigned int *mapping_flags
+);
+
+typedef void (*ompt_callback_device_initialize_t) (
+    uint64_t device_num,
+    const char *type,
+    ompt_device_t *device,
+    ompt_function_lookup_t lookup,
+    const char *documentation
+);
+
+typedef void (*ompt_callback_device_finalize_t) (
+    uint64_t device_num
+);
+
+typedef void (*ompt_callback_device_load_t) (
+    uint64_t device_num,
+    const char * filename,
+    int64_t offset_in_file,
+    void * vma_in_file,
+    size_t bytes,
+    void * host_addr,
+    void * device_addr,
+    uint64_t module_id
+);
+
+#define ompt_addr_unknown ((void *) ~0)
+
+typedef void (*ompt_callback_device_unload_t) (
+    uint64_t device_num,
+    uint64_t module_id
+);
+
+/* control_tool */
+typedef int (*ompt_callback_control_tool_t) (
+    uint64_t command,                     /* command of control call             */
+    uint64_t modifier,                    /* modifier of control call            */
+    void *arg,                            /* argument of control call            */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef enum ompt_mutex_kind_t {
+    ompt_mutex           = 0x10,
+    ompt_mutex_lock      = 0x11,
+    ompt_mutex_nest_lock = 0x12,
+    ompt_mutex_critical  = 0x13,
+    ompt_mutex_atomic    = 0x14,
+    ompt_mutex_ordered   = 0x20
+} ompt_mutex_kind_t;
+
+typedef void (*ompt_callback_mutex_acquire_t) (
+    ompt_mutex_kind_t kind,               /* mutex kind                          */
+    unsigned int hint,                    /* mutex hint                          */
+    unsigned int impl,                    /* mutex implementation                */
+    omp_wait_id_t wait_id,                /* id of object being awaited         */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_mutex_t) (
+    ompt_mutex_kind_t kind,               /* mutex kind                          */
+    omp_wait_id_t wait_id,                /* id of object being awaited         */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_nest_lock_t) (
+    ompt_scope_endpoint_t endpoint,       /* endpoint of nested lock             */
+    omp_wait_id_t wait_id,                /* id of object being awaited         */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_master_t) (
+    ompt_scope_endpoint_t endpoint,       /* endpoint of master region           */
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *task_data,               /* data of task                        */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_idle_t) (
+    ompt_scope_endpoint_t endpoint        /* endpoint of idle time               */
+);
+
+typedef enum ompt_work_type_t {
+    ompt_work_loop            = 1,
+    ompt_work_sections        = 2,
+    ompt_work_single_executor = 3,
+    ompt_work_single_other    = 4,
+    ompt_work_workshare       = 5,
+    ompt_work_distribute      = 6,
+    ompt_work_taskloop        = 7
+} ompt_work_type_t;
+
+typedef void (*ompt_callback_work_t) (
+    ompt_work_type_t wstype,              /* type of work region                 */
+    ompt_scope_endpoint_t endpoint,       /* endpoint of work region             */
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *task_data,               /* data of task                        */
+    uint64_t count,                       /* quantity of work                    */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef enum ompt_sync_region_kind_t {
+    ompt_sync_region_barrier   = 1,
+    ompt_sync_region_taskwait  = 2,
+    ompt_sync_region_taskgroup = 3
+} ompt_sync_region_kind_t;
+
+typedef void (*ompt_callback_sync_region_t) (
+    ompt_sync_region_kind_t kind,         /* kind of sync region                 */
+    ompt_scope_endpoint_t endpoint,       /* endpoint of sync region             */
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *task_data,               /* data of task                        */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef enum ompt_cancel_flag_t {
+    ompt_cancel_parallel       = 0x1,
+    ompt_cancel_sections       = 0x2,
+    ompt_cancel_do             = 0x4,
+    ompt_cancel_taskgroup      = 0x8,
+    ompt_cancel_activated      = 0x10,
+    ompt_cancel_detected       = 0x20,
+    ompt_cancel_discarded_task = 0x40
+} ompt_cancel_flag_t;
+
+typedef void (*ompt_callback_cancel_t) (
+    ompt_data_t *task_data,               /* data of task                        */
+    int flags,                            /* cancel flags                        */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_flush_t) (
+    ompt_data_t *thread_data,             /* data of thread                      */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+/****************************************************************************
+ * ompt API
+ ***************************************************************************/
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define OMPT_API_FNTYPE(fn) fn##_t
+
+#define OMPT_API_FUNCTION(return_type, fn, args)  \
+    typedef return_type (*OMPT_API_FNTYPE(fn)) args
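+
+/* For example, OMPT_API_FUNCTION(int, ompt_get_num_procs, (void)) expands to
+
+       typedef int (*ompt_get_num_procs_t) (void);
+
+   i.e. every inquiry function below gets a matching pointer type <fn>_t. */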
+
+
+
+/****************************************************************************
+ * INQUIRY FUNCTIONS
+ ***************************************************************************/
+
+/* state */
+OMPT_API_FUNCTION(omp_state_t, ompt_get_state, (
+    omp_wait_id_t *wait_id
+));
+
+/* thread */
+OMPT_API_FUNCTION(ompt_data_t*, ompt_get_thread_data, (void));
+
+/* parallel region */
+OMPT_API_FUNCTION(int, ompt_get_parallel_info, (
+    int ancestor_level,
+    ompt_data_t **parallel_data,
+    int *team_size
+));
+
+/* task */
+OMPT_API_FUNCTION(int, ompt_get_task_info, (
+    int ancestor_level,
+    int *type,
+    ompt_data_t **task_data,
+    omp_frame_t **task_frame,
+    ompt_data_t **parallel_data,
+    int *thread_num
+));
+
+/* procs */
+OMPT_API_FUNCTION(int, ompt_get_num_procs, (void));
+
+/* places */
+OMPT_API_FUNCTION(int, ompt_get_num_places, (void));
+
+OMPT_API_FUNCTION(int, ompt_get_place_proc_ids, (
+    int place_num,
+    int ids_size,
+    int *ids
+));
+
+OMPT_API_FUNCTION(int, ompt_get_place_num, (void));
+
+OMPT_API_FUNCTION(int, ompt_get_partition_place_nums, (
+    int place_nums_size,
+    int *place_nums
+));
+
+/* proc_id */
+OMPT_API_FUNCTION(int, ompt_get_proc_id, (void));
+
+
+/****************************************************************************
+ * INITIALIZATION FUNCTIONS
+ ***************************************************************************/
+
+OMPT_API_FUNCTION(int, ompt_initialize, (
+    ompt_function_lookup_t ompt_fn_lookup,
+    ompt_data_t *tool_data
+));
+
+OMPT_API_FUNCTION(void, ompt_finalize, (
+    ompt_data_t *tool_data
+));
+
+typedef struct ompt_start_tool_result_t {
+    ompt_initialize_t initialize;
+    ompt_finalize_t finalize;
+    ompt_data_t tool_data;
+} ompt_start_tool_result_t;
+
+/* initialization interface to be defined by tool */
+#ifdef _WIN32
+__declspec(dllexport)
+#endif
+ompt_start_tool_result_t * ompt_start_tool(
+    unsigned int omp_version, 
+    const char * runtime_version
+);
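+
+/* A minimal tool skeleton (an illustrative sketch; my_init and my_fini are
+   hypothetical names).  The runtime invokes ompt_start_tool once at startup;
+   returning non-NULL activates the tool:
+
+       static int my_init(ompt_function_lookup_t lookup, ompt_data_t *data) {
+           // look up inquiry functions and register callbacks here
+           return 1;  // non-zero keeps the tool active
+       }
+       static void my_fini(ompt_data_t *data) {}
+
+       ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+                                                 const char *runtime_version) {
+           static ompt_start_tool_result_t result = {my_init, my_fini, {0}};
+           return &result;
+       }
+*/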
+
+typedef void (*ompt_callback_t)(void);
+
+OMPT_API_FUNCTION(int, ompt_set_callback, (
+    ompt_callbacks_t which,
+    ompt_callback_t callback
+));
+
+OMPT_API_FUNCTION(int, ompt_get_callback, (
+    ompt_callbacks_t which,
+    ompt_callback_t *callback
+));
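+
+/* Illustrative sketch of registering a callback from a tool initializer
+   (on_thread_begin is a hypothetical ompt_callback_thread_begin_t, and
+   'lookup' is the ompt_function_lookup_t passed to the initializer):
+
+       ompt_set_callback_t set_callback =
+           (ompt_set_callback_t) lookup("ompt_set_callback");
+       set_callback(ompt_callback_thread_begin, (ompt_callback_t) on_thread_begin);
+
+   The returned int is one of the ompt_set_result_t values above. */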
+
+
+
+/****************************************************************************
+ * MISCELLANEOUS FUNCTIONS
+ ***************************************************************************/
+
+/* state enumeration */
+OMPT_API_FUNCTION(int, ompt_enumerate_states, (
+    int current_state,
+    int *next_state,
+    const char **next_state_name
+));
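+
+/* Illustrative sketch: a tool can walk the state space by repeatedly asking
+   for the successor of the current state until the call returns 0
+   (enumerate_states is a hypothetical pointer obtained via lookup):
+
+       int state = omp_state_undefined, next;
+       const char *name;
+       while (enumerate_states(state, &next, &name))
+           state = next;
+*/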
+
+/* mutex implementation enumeration */
+OMPT_API_FUNCTION(int, ompt_enumerate_mutex_impls, (
+    int current_impl,
+    int *next_impl,
+    const char **next_impl_name
+));
+
+/* get_unique_id */
+OMPT_API_FUNCTION(uint64_t, ompt_get_unique_id, (void));
+
+#ifdef  __cplusplus
+}
+#endif
+
+/****************************************************************************
+ * TARGET
+ ***************************************************************************/
+
+OMPT_API_FUNCTION(int, ompt_get_target_info, (
+    uint64_t *device_num,
+    ompt_id_t *target_id,
+    ompt_id_t *host_op_id
+));
+
+OMPT_API_FUNCTION(int, ompt_get_num_devices, (void));
+
+#endif /* __OMPT__ */
diff --git a/darwin-x86/clang-headers/opencl-c.h b/darwin-x86/clang-headers/opencl-c.h
index 8029274..e481c79 100644
--- a/darwin-x86/clang-headers/opencl-c.h
+++ b/darwin-x86/clang-headers/opencl-c.h
@@ -16,7 +16,14 @@
 #endif //cl_khr_depth_images
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+#ifdef cl_khr_3d_image_writes
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#endif //cl_khr_3d_image_writes
+#endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0
+
 #define __ovld __attribute__((overloadable))
+#define __conv __attribute__((convergent))
 
 // Optimizations
 #define __purefn __attribute__((pure))
@@ -6577,777 +6584,85 @@
  * OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
  * Reinterprets a data type as another data type of the same size
  */
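+/* Illustrative example (not part of the upstream header): as_type is a
+   bit-level reinterpretation, not a value conversion, e.g.
+
+       float f = as_float(0x3f800000u);   // f == 1.0f
+       uint  u = as_uint(1.0f);           // u == 0x3f800000
+*/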
-char __ovld __cnfn as_char(char);
-char __ovld __cnfn as_char(uchar);
+#define as_char(x) __builtin_astype((x),   char)
+#define as_char2(x) __builtin_astype((x),  char2)
+#define as_char3(x) __builtin_astype((x),  char3)
+#define as_char4(x) __builtin_astype((x),  char4)
+#define as_char8(x) __builtin_astype((x),  char8)
+#define as_char16(x) __builtin_astype((x), char16)
 
-char2 __ovld __cnfn as_char2(char2);
-char2 __ovld __cnfn as_char2(uchar2);
-char2 __ovld __cnfn as_char2(short);
-char2 __ovld __cnfn as_char2(ushort);
+#define as_uchar(x) __builtin_astype((x),   uchar)
+#define as_uchar2(x) __builtin_astype((x),  uchar2)
+#define as_uchar3(x) __builtin_astype((x),  uchar3)
+#define as_uchar4(x) __builtin_astype((x),  uchar4)
+#define as_uchar8(x) __builtin_astype((x),  uchar8)
+#define as_uchar16(x) __builtin_astype((x), uchar16)
 
-char3 __ovld __cnfn as_char3(char3);
-char3 __ovld __cnfn as_char3(char4);
-char3 __ovld __cnfn as_char3(uchar3);
-char3 __ovld __cnfn as_char3(uchar4);
-char3 __ovld __cnfn as_char3(short2);
-char3 __ovld __cnfn as_char3(ushort2);
-char3 __ovld __cnfn as_char3(int);
-char3 __ovld __cnfn as_char3(uint);
-char3 __ovld __cnfn as_char3(float);
+#define as_short(x) __builtin_astype((x),   short)
+#define as_short2(x) __builtin_astype((x),  short2)
+#define as_short3(x) __builtin_astype((x),  short3)
+#define as_short4(x) __builtin_astype((x),  short4)
+#define as_short8(x) __builtin_astype((x),  short8)
+#define as_short16(x) __builtin_astype((x), short16)
 
-char4 __ovld __cnfn as_char4(char3);
-char4 __ovld __cnfn as_char4(char4);
-char4 __ovld __cnfn as_char4(uchar3);
-char4 __ovld __cnfn as_char4(uchar4);
-char4 __ovld __cnfn as_char4(short2);
-char4 __ovld __cnfn as_char4(ushort2);
-char4 __ovld __cnfn as_char4(int);
-char4 __ovld __cnfn as_char4(uint);
-char4 __ovld __cnfn as_char4(float);
+#define as_ushort(x) __builtin_astype((x),   ushort)
+#define as_ushort2(x) __builtin_astype((x),  ushort2)
+#define as_ushort3(x) __builtin_astype((x),  ushort3)
+#define as_ushort4(x) __builtin_astype((x),  ushort4)
+#define as_ushort8(x) __builtin_astype((x),  ushort8)
+#define as_ushort16(x) __builtin_astype((x), ushort16)
 
-char8 __ovld __cnfn as_char8(char8);
-char8 __ovld __cnfn as_char8(uchar8);
-char8 __ovld __cnfn as_char8(short3);
-char8 __ovld __cnfn as_char8(short4);
-char8 __ovld __cnfn as_char8(ushort3);
-char8 __ovld __cnfn as_char8(ushort4);
-char8 __ovld __cnfn as_char8(int2);
-char8 __ovld __cnfn as_char8(uint2);
-char8 __ovld __cnfn as_char8(long);
-char8 __ovld __cnfn as_char8(ulong);
-char8 __ovld __cnfn as_char8(float2);
+#define as_int(x) __builtin_astype((x),   int)
+#define as_int2(x) __builtin_astype((x),  int2)
+#define as_int3(x) __builtin_astype((x),  int3)
+#define as_int4(x) __builtin_astype((x),  int4)
+#define as_int8(x) __builtin_astype((x),  int8)
+#define as_int16(x) __builtin_astype((x), int16)
 
-char16 __ovld __cnfn as_char16(char16);
-char16 __ovld __cnfn as_char16(uchar16);
-char16 __ovld __cnfn as_char16(short8);
-char16 __ovld __cnfn as_char16(ushort8);
-char16 __ovld __cnfn as_char16(int3);
-char16 __ovld __cnfn as_char16(int4);
-char16 __ovld __cnfn as_char16(uint3);
-char16 __ovld __cnfn as_char16(uint4);
-char16 __ovld __cnfn as_char16(long2);
-char16 __ovld __cnfn as_char16(ulong2);
-char16 __ovld __cnfn as_char16(float3);
-char16 __ovld __cnfn as_char16(float4);
+#define as_uint(x) __builtin_astype((x),   uint)
+#define as_uint2(x) __builtin_astype((x),  uint2)
+#define as_uint3(x) __builtin_astype((x),  uint3)
+#define as_uint4(x) __builtin_astype((x),  uint4)
+#define as_uint8(x) __builtin_astype((x),  uint8)
+#define as_uint16(x) __builtin_astype((x), uint16)
 
-uchar __ovld __cnfn as_uchar(char);
-uchar __ovld __cnfn as_uchar(uchar);
+#define as_long(x) __builtin_astype((x),   long)
+#define as_long2(x) __builtin_astype((x),  long2)
+#define as_long3(x) __builtin_astype((x),  long3)
+#define as_long4(x) __builtin_astype((x),  long4)
+#define as_long8(x) __builtin_astype((x),  long8)
+#define as_long16(x) __builtin_astype((x), long16)
 
-uchar2 __ovld __cnfn as_uchar2(char2);
-uchar2 __ovld __cnfn as_uchar2(uchar2);
-uchar2 __ovld __cnfn as_uchar2(short);
-uchar2 __ovld __cnfn as_uchar2(ushort);
+#define as_ulong(x) __builtin_astype((x),   ulong)
+#define as_ulong2(x) __builtin_astype((x),  ulong2)
+#define as_ulong3(x) __builtin_astype((x),  ulong3)
+#define as_ulong4(x) __builtin_astype((x),  ulong4)
+#define as_ulong8(x) __builtin_astype((x),  ulong8)
+#define as_ulong16(x) __builtin_astype((x), ulong16)
 
-uchar3 __ovld __cnfn as_uchar3(char3);
-uchar3 __ovld __cnfn as_uchar3(char4);
-uchar3 __ovld __cnfn as_uchar3(uchar3);
-uchar3 __ovld __cnfn as_uchar3(uchar4);
-uchar3 __ovld __cnfn as_uchar3(short2);
-uchar3 __ovld __cnfn as_uchar3(ushort2);
-uchar3 __ovld __cnfn as_uchar3(int);
-uchar3 __ovld __cnfn as_uchar3(uint);
-uchar3 __ovld __cnfn as_uchar3(float);
-
-uchar4 __ovld __cnfn as_uchar4(char3);
-uchar4 __ovld __cnfn as_uchar4(char4);
-uchar4 __ovld __cnfn as_uchar4(uchar3);
-uchar4 __ovld __cnfn as_uchar4(uchar4);
-uchar4 __ovld __cnfn as_uchar4(short2);
-uchar4 __ovld __cnfn as_uchar4(ushort2);
-uchar4 __ovld __cnfn as_uchar4(int);
-uchar4 __ovld __cnfn as_uchar4(uint);
-uchar4 __ovld __cnfn as_uchar4(float);
-
-uchar8 __ovld __cnfn as_uchar8(char8);
-uchar8 __ovld __cnfn as_uchar8(uchar8);
-uchar8 __ovld __cnfn as_uchar8(short3);
-uchar8 __ovld __cnfn as_uchar8(short4);
-uchar8 __ovld __cnfn as_uchar8(ushort3);
-uchar8 __ovld __cnfn as_uchar8(ushort4);
-uchar8 __ovld __cnfn as_uchar8(int2);
-uchar8 __ovld __cnfn as_uchar8(uint2);
-uchar8 __ovld __cnfn as_uchar8(long);
-uchar8 __ovld __cnfn as_uchar8(ulong);
-uchar8 __ovld __cnfn as_uchar8(float2);
-
-uchar16 __ovld __cnfn as_uchar16(char16);
-uchar16 __ovld __cnfn as_uchar16(uchar16);
-uchar16 __ovld __cnfn as_uchar16(short8);
-uchar16 __ovld __cnfn as_uchar16(ushort8);
-uchar16 __ovld __cnfn as_uchar16(int3);
-uchar16 __ovld __cnfn as_uchar16(int4);
-uchar16 __ovld __cnfn as_uchar16(uint3);
-uchar16 __ovld __cnfn as_uchar16(uint4);
-uchar16 __ovld __cnfn as_uchar16(long2);
-uchar16 __ovld __cnfn as_uchar16(ulong2);
-uchar16 __ovld __cnfn as_uchar16(float3);
-uchar16 __ovld __cnfn as_uchar16(float4);
-
-short __ovld __cnfn as_short(char2);
-short __ovld __cnfn as_short(uchar2);
-short __ovld __cnfn as_short(short);
-short __ovld __cnfn as_short(ushort);
-
-short2 __ovld __cnfn as_short2(char3);
-short2 __ovld __cnfn as_short2(char4);
-short2 __ovld __cnfn as_short2(uchar3);
-short2 __ovld __cnfn as_short2(uchar4);
-short2 __ovld __cnfn as_short2(short2);
-short2 __ovld __cnfn as_short2(ushort2);
-short2 __ovld __cnfn as_short2(int);
-short2 __ovld __cnfn as_short2(uint);
-short2 __ovld __cnfn as_short2(float);
-
-short3 __ovld __cnfn as_short3(char8);
-short3 __ovld __cnfn as_short3(uchar8);
-short3 __ovld __cnfn as_short3(short3);
-short3 __ovld __cnfn as_short3(short4);
-short3 __ovld __cnfn as_short3(ushort3);
-short3 __ovld __cnfn as_short3(ushort4);
-short3 __ovld __cnfn as_short3(int2);
-short3 __ovld __cnfn as_short3(uint2);
-short3 __ovld __cnfn as_short3(long);
-short3 __ovld __cnfn as_short3(ulong);
-short3 __ovld __cnfn as_short3(float2);
-
-short4 __ovld __cnfn as_short4(char8);
-short4 __ovld __cnfn as_short4(uchar8);
-short4 __ovld __cnfn as_short4(short3);
-short4 __ovld __cnfn as_short4(short4);
-short4 __ovld __cnfn as_short4(ushort3);
-short4 __ovld __cnfn as_short4(ushort4);
-short4 __ovld __cnfn as_short4(int2);
-short4 __ovld __cnfn as_short4(uint2);
-short4 __ovld __cnfn as_short4(long);
-short4 __ovld __cnfn as_short4(ulong);
-short4 __ovld __cnfn as_short4(float2);
-
-short8 __ovld __cnfn as_short8(char16);
-short8 __ovld __cnfn as_short8(uchar16);
-short8 __ovld __cnfn as_short8(short8);
-short8 __ovld __cnfn as_short8(ushort8);
-short8 __ovld __cnfn as_short8(int3);
-short8 __ovld __cnfn as_short8(int4);
-short8 __ovld __cnfn as_short8(uint3);
-short8 __ovld __cnfn as_short8(uint4);
-short8 __ovld __cnfn as_short8(long2);
-short8 __ovld __cnfn as_short8(ulong2);
-short8 __ovld __cnfn as_short8(float3);
-short8 __ovld __cnfn as_short8(float4);
-
-short16 __ovld __cnfn as_short16(short16);
-short16 __ovld __cnfn as_short16(ushort16);
-short16 __ovld __cnfn as_short16(int8);
-short16 __ovld __cnfn as_short16(uint8);
-short16 __ovld __cnfn as_short16(long3);
-short16 __ovld __cnfn as_short16(long4);
-short16 __ovld __cnfn as_short16(ulong3);
-short16 __ovld __cnfn as_short16(ulong4);
-short16 __ovld __cnfn as_short16(float8);
-
-ushort __ovld __cnfn as_ushort(char2);
-ushort __ovld __cnfn as_ushort(uchar2);
-ushort __ovld __cnfn as_ushort(short);
-ushort __ovld __cnfn as_ushort(ushort);
-
-ushort2 __ovld __cnfn as_ushort2(char3);
-ushort2 __ovld __cnfn as_ushort2(char4);
-ushort2 __ovld __cnfn as_ushort2(uchar3);
-ushort2 __ovld __cnfn as_ushort2(uchar4);
-ushort2 __ovld __cnfn as_ushort2(short2);
-ushort2 __ovld __cnfn as_ushort2(ushort2);
-ushort2 __ovld __cnfn as_ushort2(int);
-ushort2 __ovld __cnfn as_ushort2(uint);
-ushort2 __ovld __cnfn as_ushort2(float);
-
-ushort3 __ovld __cnfn as_ushort3(char8);
-ushort3 __ovld __cnfn as_ushort3(uchar8);
-ushort3 __ovld __cnfn as_ushort3(short3);
-ushort3 __ovld __cnfn as_ushort3(short4);
-ushort3 __ovld __cnfn as_ushort3(ushort3);
-ushort3 __ovld __cnfn as_ushort3(ushort4);
-ushort3 __ovld __cnfn as_ushort3(int2);
-ushort3 __ovld __cnfn as_ushort3(uint2);
-ushort3 __ovld __cnfn as_ushort3(long);
-ushort3 __ovld __cnfn as_ushort3(ulong);
-ushort3 __ovld __cnfn as_ushort3(float2);
-
-ushort4 __ovld __cnfn as_ushort4(char8);
-ushort4 __ovld __cnfn as_ushort4(uchar8);
-ushort4 __ovld __cnfn as_ushort4(short3);
-ushort4 __ovld __cnfn as_ushort4(short4);
-ushort4 __ovld __cnfn as_ushort4(ushort3);
-ushort4 __ovld __cnfn as_ushort4(ushort4);
-ushort4 __ovld __cnfn as_ushort4(int2);
-ushort4 __ovld __cnfn as_ushort4(uint2);
-ushort4 __ovld __cnfn as_ushort4(long);
-ushort4 __ovld __cnfn as_ushort4(ulong);
-ushort4 __ovld __cnfn as_ushort4(float2);
-
-ushort8 __ovld __cnfn as_ushort8(char16);
-ushort8 __ovld __cnfn as_ushort8(uchar16);
-ushort8 __ovld __cnfn as_ushort8(short8);
-ushort8 __ovld __cnfn as_ushort8(ushort8);
-ushort8 __ovld __cnfn as_ushort8(int3);
-ushort8 __ovld __cnfn as_ushort8(int4);
-ushort8 __ovld __cnfn as_ushort8(uint3);
-ushort8 __ovld __cnfn as_ushort8(uint4);
-ushort8 __ovld __cnfn as_ushort8(long2);
-ushort8 __ovld __cnfn as_ushort8(ulong2);
-ushort8 __ovld __cnfn as_ushort8(float3);
-ushort8 __ovld __cnfn as_ushort8(float4);
-
-ushort16 __ovld __cnfn as_ushort16(short16);
-ushort16 __ovld __cnfn as_ushort16(ushort16);
-ushort16 __ovld __cnfn as_ushort16(int8);
-ushort16 __ovld __cnfn as_ushort16(uint8);
-ushort16 __ovld __cnfn as_ushort16(long3);
-ushort16 __ovld __cnfn as_ushort16(long4);
-ushort16 __ovld __cnfn as_ushort16(ulong3);
-ushort16 __ovld __cnfn as_ushort16(ulong4);
-ushort16 __ovld __cnfn as_ushort16(float8);
-
-int __ovld __cnfn as_int(char3);
-int __ovld __cnfn as_int(char4);
-int __ovld __cnfn as_int(uchar3);
-int __ovld __cnfn as_int(uchar4);
-int __ovld __cnfn as_int(short2);
-int __ovld __cnfn as_int(ushort2);
-int __ovld __cnfn as_int(int);
-int __ovld __cnfn as_int(uint);
-int __ovld __cnfn as_int(float);
-
-int2 __ovld __cnfn as_int2(char8);
-int2 __ovld __cnfn as_int2(uchar8);
-int2 __ovld __cnfn as_int2(short3);
-int2 __ovld __cnfn as_int2(short4);
-int2 __ovld __cnfn as_int2(ushort3);
-int2 __ovld __cnfn as_int2(ushort4);
-int2 __ovld __cnfn as_int2(int2);
-int2 __ovld __cnfn as_int2(uint2);
-int2 __ovld __cnfn as_int2(long);
-int2 __ovld __cnfn as_int2(ulong);
-int2 __ovld __cnfn as_int2(float2);
-
-int3 __ovld __cnfn as_int3(char16);
-int3 __ovld __cnfn as_int3(uchar16);
-int3 __ovld __cnfn as_int3(short8);
-int3 __ovld __cnfn as_int3(ushort8);
-int3 __ovld __cnfn as_int3(int3);
-int3 __ovld __cnfn as_int3(int4);
-int3 __ovld __cnfn as_int3(uint3);
-int3 __ovld __cnfn as_int3(uint4);
-int3 __ovld __cnfn as_int3(long2);
-int3 __ovld __cnfn as_int3(ulong2);
-int3 __ovld __cnfn as_int3(float3);
-int3 __ovld __cnfn as_int3(float4);
-
-int4 __ovld __cnfn as_int4(char16);
-int4 __ovld __cnfn as_int4(uchar16);
-int4 __ovld __cnfn as_int4(short8);
-int4 __ovld __cnfn as_int4(ushort8);
-int4 __ovld __cnfn as_int4(int3);
-int4 __ovld __cnfn as_int4(int4);
-int4 __ovld __cnfn as_int4(uint3);
-int4 __ovld __cnfn as_int4(uint4);
-int4 __ovld __cnfn as_int4(long2);
-int4 __ovld __cnfn as_int4(ulong2);
-int4 __ovld __cnfn as_int4(float3);
-int4 __ovld __cnfn as_int4(float4);
-
-int8 __ovld __cnfn as_int8(short16);
-int8 __ovld __cnfn as_int8(ushort16);
-int8 __ovld __cnfn as_int8(int8);
-int8 __ovld __cnfn as_int8(uint8);
-int8 __ovld __cnfn as_int8(long3);
-int8 __ovld __cnfn as_int8(long4);
-int8 __ovld __cnfn as_int8(ulong3);
-int8 __ovld __cnfn as_int8(ulong4);
-int8 __ovld __cnfn as_int8(float8);
-
-int16 __ovld __cnfn as_int16(int16);
-int16 __ovld __cnfn as_int16(uint16);
-int16 __ovld __cnfn as_int16(long8);
-int16 __ovld __cnfn as_int16(ulong8);
-int16 __ovld __cnfn as_int16(float16);
-
-uint __ovld __cnfn as_uint(char3);
-uint __ovld __cnfn as_uint(char4);
-uint __ovld __cnfn as_uint(uchar3);
-uint __ovld __cnfn as_uint(uchar4);
-uint __ovld __cnfn as_uint(short2);
-uint __ovld __cnfn as_uint(ushort2);
-uint __ovld __cnfn as_uint(int);
-uint __ovld __cnfn as_uint(uint);
-uint __ovld __cnfn as_uint(float);
-
-uint2 __ovld __cnfn as_uint2(char8);
-uint2 __ovld __cnfn as_uint2(uchar8);
-uint2 __ovld __cnfn as_uint2(short3);
-uint2 __ovld __cnfn as_uint2(short4);
-uint2 __ovld __cnfn as_uint2(ushort3);
-uint2 __ovld __cnfn as_uint2(ushort4);
-uint2 __ovld __cnfn as_uint2(int2);
-uint2 __ovld __cnfn as_uint2(uint2);
-uint2 __ovld __cnfn as_uint2(long);
-uint2 __ovld __cnfn as_uint2(ulong);
-uint2 __ovld __cnfn as_uint2(float2);
-
-uint3 __ovld __cnfn as_uint3(char16);
-uint3 __ovld __cnfn as_uint3(uchar16);
-uint3 __ovld __cnfn as_uint3(short8);
-uint3 __ovld __cnfn as_uint3(ushort8);
-uint3 __ovld __cnfn as_uint3(int3);
-uint3 __ovld __cnfn as_uint3(int4);
-uint3 __ovld __cnfn as_uint3(uint3);
-uint3 __ovld __cnfn as_uint3(uint4);
-uint3 __ovld __cnfn as_uint3(long2);
-uint3 __ovld __cnfn as_uint3(ulong2);
-uint3 __ovld __cnfn as_uint3(float3);
-uint3 __ovld __cnfn as_uint3(float4);
-
-uint4 __ovld __cnfn as_uint4(char16);
-uint4 __ovld __cnfn as_uint4(uchar16);
-uint4 __ovld __cnfn as_uint4(short8);
-uint4 __ovld __cnfn as_uint4(ushort8);
-uint4 __ovld __cnfn as_uint4(int3);
-uint4 __ovld __cnfn as_uint4(int4);
-uint4 __ovld __cnfn as_uint4(uint3);
-uint4 __ovld __cnfn as_uint4(uint4);
-uint4 __ovld __cnfn as_uint4(long2);
-uint4 __ovld __cnfn as_uint4(ulong2);
-uint4 __ovld __cnfn as_uint4(float3);
-uint4 __ovld __cnfn as_uint4(float4);
-
-uint8 __ovld __cnfn as_uint8(short16);
-uint8 __ovld __cnfn as_uint8(ushort16);
-uint8 __ovld __cnfn as_uint8(int8);
-uint8 __ovld __cnfn as_uint8(uint8);
-uint8 __ovld __cnfn as_uint8(long3);
-uint8 __ovld __cnfn as_uint8(long4);
-uint8 __ovld __cnfn as_uint8(ulong3);
-uint8 __ovld __cnfn as_uint8(ulong4);
-uint8 __ovld __cnfn as_uint8(float8);
-
-uint16 __ovld __cnfn as_uint16(int16);
-uint16 __ovld __cnfn as_uint16(uint16);
-uint16 __ovld __cnfn as_uint16(long8);
-uint16 __ovld __cnfn as_uint16(ulong8);
-uint16 __ovld __cnfn as_uint16(float16);
-
-long __ovld __cnfn as_long(char8);
-long __ovld __cnfn as_long(uchar8);
-long __ovld __cnfn as_long(short3);
-long __ovld __cnfn as_long(short4);
-long __ovld __cnfn as_long(ushort3);
-long __ovld __cnfn as_long(ushort4);
-long __ovld __cnfn as_long(int2);
-long __ovld __cnfn as_long(uint2);
-long __ovld __cnfn as_long(long);
-long __ovld __cnfn as_long(ulong);
-long __ovld __cnfn as_long(float2);
-
-long2 __ovld __cnfn as_long2(char16);
-long2 __ovld __cnfn as_long2(uchar16);
-long2 __ovld __cnfn as_long2(short8);
-long2 __ovld __cnfn as_long2(ushort8);
-long2 __ovld __cnfn as_long2(int3);
-long2 __ovld __cnfn as_long2(int4);
-long2 __ovld __cnfn as_long2(uint3);
-long2 __ovld __cnfn as_long2(uint4);
-long2 __ovld __cnfn as_long2(long2);
-long2 __ovld __cnfn as_long2(ulong2);
-long2 __ovld __cnfn as_long2(float3);
-long2 __ovld __cnfn as_long2(float4);
-
-long3 __ovld __cnfn as_long3(short16);
-long3 __ovld __cnfn as_long3(ushort16);
-long3 __ovld __cnfn as_long3(int8);
-long3 __ovld __cnfn as_long3(uint8);
-long3 __ovld __cnfn as_long3(long3);
-long3 __ovld __cnfn as_long3(long4);
-long3 __ovld __cnfn as_long3(ulong3);
-long3 __ovld __cnfn as_long3(ulong4);
-long3 __ovld __cnfn as_long3(float8);
-
-long4 __ovld __cnfn as_long4(short16);
-long4 __ovld __cnfn as_long4(ushort16);
-long4 __ovld __cnfn as_long4(int8);
-long4 __ovld __cnfn as_long4(uint8);
-long4 __ovld __cnfn as_long4(long3);
-long4 __ovld __cnfn as_long4(long4);
-long4 __ovld __cnfn as_long4(ulong3);
-long4 __ovld __cnfn as_long4(ulong4);
-long4 __ovld __cnfn as_long4(float8);
-
-long8 __ovld __cnfn as_long8(int16);
-long8 __ovld __cnfn as_long8(uint16);
-long8 __ovld __cnfn as_long8(long8);
-long8 __ovld __cnfn as_long8(ulong8);
-long8 __ovld __cnfn as_long8(float16);
-
-long16 __ovld __cnfn as_long16(long16);
-long16 __ovld __cnfn as_long16(ulong16);
-
-ulong __ovld __cnfn as_ulong(char8);
-ulong __ovld __cnfn as_ulong(uchar8);
-ulong __ovld __cnfn as_ulong(short3);
-ulong __ovld __cnfn as_ulong(short4);
-ulong __ovld __cnfn as_ulong(ushort3);
-ulong __ovld __cnfn as_ulong(ushort4);
-ulong __ovld __cnfn as_ulong(int2);
-ulong __ovld __cnfn as_ulong(uint2);
-ulong __ovld __cnfn as_ulong(long);
-ulong __ovld __cnfn as_ulong(ulong);
-ulong __ovld __cnfn as_ulong(float2);
-
-ulong2 __ovld __cnfn as_ulong2(char16);
-ulong2 __ovld __cnfn as_ulong2(uchar16);
-ulong2 __ovld __cnfn as_ulong2(short8);
-ulong2 __ovld __cnfn as_ulong2(ushort8);
-ulong2 __ovld __cnfn as_ulong2(int3);
-ulong2 __ovld __cnfn as_ulong2(int4);
-ulong2 __ovld __cnfn as_ulong2(uint3);
-ulong2 __ovld __cnfn as_ulong2(uint4);
-ulong2 __ovld __cnfn as_ulong2(long2);
-ulong2 __ovld __cnfn as_ulong2(ulong2);
-ulong2 __ovld __cnfn as_ulong2(float3);
-ulong2 __ovld __cnfn as_ulong2(float4);
-
-ulong3 __ovld __cnfn as_ulong3(short16);
-ulong3 __ovld __cnfn as_ulong3(ushort16);
-ulong3 __ovld __cnfn as_ulong3(int8);
-ulong3 __ovld __cnfn as_ulong3(uint8);
-ulong3 __ovld __cnfn as_ulong3(long3);
-ulong3 __ovld __cnfn as_ulong3(long4);
-ulong3 __ovld __cnfn as_ulong3(ulong3);
-ulong3 __ovld __cnfn as_ulong3(ulong4);
-ulong3 __ovld __cnfn as_ulong3(float8);
-
-ulong4 __ovld __cnfn as_ulong4(short16);
-ulong4 __ovld __cnfn as_ulong4(ushort16);
-ulong4 __ovld __cnfn as_ulong4(int8);
-ulong4 __ovld __cnfn as_ulong4(uint8);
-ulong4 __ovld __cnfn as_ulong4(long3);
-ulong4 __ovld __cnfn as_ulong4(long4);
-ulong4 __ovld __cnfn as_ulong4(ulong3);
-ulong4 __ovld __cnfn as_ulong4(ulong4);
-ulong4 __ovld __cnfn as_ulong4(float8);
-
-ulong8 __ovld __cnfn as_ulong8(int16);
-ulong8 __ovld __cnfn as_ulong8(uint16);
-ulong8 __ovld __cnfn as_ulong8(long8);
-ulong8 __ovld __cnfn as_ulong8(ulong8);
-ulong8 __ovld __cnfn as_ulong8(float16);
-
-ulong16 __ovld __cnfn as_ulong16(long16);
-ulong16 __ovld __cnfn as_ulong16(ulong16);
-
-float __ovld __cnfn as_float(char3);
-float __ovld __cnfn as_float(char4);
-float __ovld __cnfn as_float(uchar3);
-float __ovld __cnfn as_float(uchar4);
-float __ovld __cnfn as_float(short2);
-float __ovld __cnfn as_float(ushort2);
-float __ovld __cnfn as_float(int);
-float __ovld __cnfn as_float(uint);
-float __ovld __cnfn as_float(float);
-
-float2 __ovld __cnfn as_float2(char8);
-float2 __ovld __cnfn as_float2(uchar8);
-float2 __ovld __cnfn as_float2(short3);
-float2 __ovld __cnfn as_float2(short4);
-float2 __ovld __cnfn as_float2(ushort3);
-float2 __ovld __cnfn as_float2(ushort4);
-float2 __ovld __cnfn as_float2(int2);
-float2 __ovld __cnfn as_float2(uint2);
-float2 __ovld __cnfn as_float2(long);
-float2 __ovld __cnfn as_float2(ulong);
-float2 __ovld __cnfn as_float2(float2);
-
-float3 __ovld __cnfn as_float3(char16);
-float3 __ovld __cnfn as_float3(uchar16);
-float3 __ovld __cnfn as_float3(short8);
-float3 __ovld __cnfn as_float3(ushort8);
-float3 __ovld __cnfn as_float3(int3);
-float3 __ovld __cnfn as_float3(int4);
-float3 __ovld __cnfn as_float3(uint3);
-float3 __ovld __cnfn as_float3(uint4);
-float3 __ovld __cnfn as_float3(long2);
-float3 __ovld __cnfn as_float3(ulong2);
-float3 __ovld __cnfn as_float3(float3);
-float3 __ovld __cnfn as_float3(float4);
-
-float4 __ovld __cnfn as_float4(char16);
-float4 __ovld __cnfn as_float4(uchar16);
-float4 __ovld __cnfn as_float4(short8);
-float4 __ovld __cnfn as_float4(ushort8);
-float4 __ovld __cnfn as_float4(int3);
-float4 __ovld __cnfn as_float4(int4);
-float4 __ovld __cnfn as_float4(uint3);
-float4 __ovld __cnfn as_float4(uint4);
-float4 __ovld __cnfn as_float4(long2);
-float4 __ovld __cnfn as_float4(ulong2);
-float4 __ovld __cnfn as_float4(float3);
-float4 __ovld __cnfn as_float4(float4);
-
-float8 __ovld __cnfn as_float8(short16);
-float8 __ovld __cnfn as_float8(ushort16);
-float8 __ovld __cnfn as_float8(int8);
-float8 __ovld __cnfn as_float8(uint8);
-float8 __ovld __cnfn as_float8(long3);
-float8 __ovld __cnfn as_float8(long4);
-float8 __ovld __cnfn as_float8(ulong3);
-float8 __ovld __cnfn as_float8(ulong4);
-float8 __ovld __cnfn as_float8(float8);
-
-float16 __ovld __cnfn as_float16(int16);
-float16 __ovld __cnfn as_float16(uint16);
-float16 __ovld __cnfn as_float16(long8);
-float16 __ovld __cnfn as_float16(ulong8);
-float16 __ovld __cnfn as_float16(float16);
+#define as_float(x) __builtin_astype((x),   float)
+#define as_float2(x) __builtin_astype((x),  float2)
+#define as_float3(x) __builtin_astype((x),  float3)
+#define as_float4(x) __builtin_astype((x),  float4)
+#define as_float8(x) __builtin_astype((x),  float8)
+#define as_float16(x) __builtin_astype((x), float16)
 
 #ifdef cl_khr_fp64
-char8 __ovld __cnfn as_char8(double);
-char16 __ovld __cnfn as_char16(double2);
-uchar8 __ovld __cnfn as_uchar8(double);
-uchar16 __ovld __cnfn as_uchar16(double2);
-short3 __ovld __cnfn as_short3(double);
-short4 __ovld __cnfn as_short4(double);
-short8 __ovld __cnfn as_short8(double2);
-short16 __ovld __cnfn as_short16(double3);
-short16 __ovld __cnfn as_short16(double4);
-ushort3 __ovld __cnfn as_ushort3(double);
-ushort4 __ovld __cnfn as_ushort4(double);
-ushort8 __ovld __cnfn as_ushort8(double2);
-ushort16 __ovld __cnfn as_ushort16(double3);
-ushort16 __ovld __cnfn as_ushort16(double4);
-int2 __ovld __cnfn as_int2(double);
-int3 __ovld __cnfn as_int3(double2);
-int4 __ovld __cnfn as_int4(double2);
-int8 __ovld __cnfn as_int8(double3);
-int8 __ovld __cnfn as_int8(double4);
-int16 __ovld __cnfn as_int16(double8);
-uint2 __ovld __cnfn as_uint2(double);
-uint3 __ovld __cnfn as_uint3(double2);
-uint4 __ovld __cnfn as_uint4(double2);
-uint8 __ovld __cnfn as_uint8(double3);
-uint8 __ovld __cnfn as_uint8(double4);
-uint16 __ovld __cnfn as_uint16(double8);
-long __ovld __cnfn as_long(double);
-long2 __ovld __cnfn as_long2(double2);
-long3 __ovld __cnfn as_long3(double3);
-long3 __ovld __cnfn as_long3(double4);
-long4 __ovld __cnfn as_long4(double3);
-long4 __ovld __cnfn as_long4(double4);
-long8 __ovld __cnfn as_long8(double8);
-long16 __ovld __cnfn as_long16(double16);
-ulong __ovld __cnfn as_ulong(double);
-ulong2 __ovld __cnfn as_ulong2(double2);
-ulong3 __ovld __cnfn as_ulong3(double3);
-ulong3 __ovld __cnfn as_ulong3(double4);
-ulong4 __ovld __cnfn as_ulong4(double3);
-ulong4 __ovld __cnfn as_ulong4(double4);
-ulong8 __ovld __cnfn as_ulong8(double8);
-ulong16 __ovld __cnfn as_ulong16(double16);
-float2 __ovld __cnfn as_float2(double);
-float3 __ovld __cnfn as_float3(double2);
-float4 __ovld __cnfn as_float4(double2);
-float8 __ovld __cnfn as_float8(double3);
-float8 __ovld __cnfn as_float8(double4);
-float16 __ovld __cnfn as_float16(double8);
-double __ovld __cnfn as_double(char8);
-double __ovld __cnfn as_double(uchar8);
-double __ovld __cnfn as_double(short3);
-double __ovld __cnfn as_double(short4);
-double __ovld __cnfn as_double(ushort3);
-double __ovld __cnfn as_double(ushort4);
-double __ovld __cnfn as_double(int2);
-double __ovld __cnfn as_double(uint2);
-double __ovld __cnfn as_double(long);
-double __ovld __cnfn as_double(ulong);
-double __ovld __cnfn as_double(float2);
-double __ovld __cnfn as_double(double);
-double2 __ovld __cnfn as_double2(char16);
-double2 __ovld __cnfn as_double2(uchar16);
-double2 __ovld __cnfn as_double2(short8);
-double2 __ovld __cnfn as_double2(ushort8);
-double2 __ovld __cnfn as_double2(int3);
-double2 __ovld __cnfn as_double2(int4);
-double2 __ovld __cnfn as_double2(uint3);
-double2 __ovld __cnfn as_double2(uint4);
-double2 __ovld __cnfn as_double2(long2);
-double2 __ovld __cnfn as_double2(ulong2);
-double2 __ovld __cnfn as_double2(float3);
-double2 __ovld __cnfn as_double2(float4);
-double2 __ovld __cnfn as_double2(double2);
-double3 __ovld __cnfn as_double3(short16);
-double3 __ovld __cnfn as_double3(ushort16);
-double3 __ovld __cnfn as_double3(int8);
-double3 __ovld __cnfn as_double3(uint8);
-double3 __ovld __cnfn as_double3(long3);
-double3 __ovld __cnfn as_double3(long4);
-double3 __ovld __cnfn as_double3(ulong3);
-double3 __ovld __cnfn as_double3(ulong4);
-double3 __ovld __cnfn as_double3(float8);
-double3 __ovld __cnfn as_double3(double3);
-double3 __ovld __cnfn as_double3(double4);
-double4 __ovld __cnfn as_double4(short16);
-double4 __ovld __cnfn as_double4(ushort16);
-double4 __ovld __cnfn as_double4(int8);
-double4 __ovld __cnfn as_double4(uint8);
-double4 __ovld __cnfn as_double4(long3);
-double4 __ovld __cnfn as_double4(long4);
-double4 __ovld __cnfn as_double4(ulong3);
-double4 __ovld __cnfn as_double4(ulong4);
-double4 __ovld __cnfn as_double4(float8);
-double4 __ovld __cnfn as_double4(double3);
-double4 __ovld __cnfn as_double4(double4);
-double8 __ovld __cnfn as_double8(int16);
-double8 __ovld __cnfn as_double8(uint16);
-double8 __ovld __cnfn as_double8(long8);
-double8 __ovld __cnfn as_double8(ulong8);
-double8 __ovld __cnfn as_double8(float16);
-double8 __ovld __cnfn as_double8(double8);
-double16 __ovld __cnfn as_double16(long16);
-double16 __ovld __cnfn as_double16(ulong16);
-double16 __ovld __cnfn as_double16(double16);
+#define as_double(x) __builtin_astype((x),   double)
+#define as_double2(x) __builtin_astype((x),  double2)
+#define as_double3(x) __builtin_astype((x),  double3)
+#define as_double4(x) __builtin_astype((x),  double4)
+#define as_double8(x) __builtin_astype((x),  double8)
+#define as_double16(x) __builtin_astype((x), double16)
 #endif //cl_khr_fp64
 
 #ifdef cl_khr_fp16
-char2 __ovld __cnfn as_char2(half);
-char3 __ovld __cnfn as_char3(half2);
-char4 __ovld __cnfn as_char4(half2);
-char8 __ovld __cnfn as_char8(half3);
-char8 __ovld __cnfn as_char8(half4);
-char16 __ovld __cnfn as_char16(half8);
-uchar2 __ovld __cnfn as_uchar2(half);
-uchar3 __ovld __cnfn as_uchar3(half2);
-uchar4 __ovld __cnfn as_uchar4(half2);
-uchar8 __ovld __cnfn as_uchar8(half3);
-uchar8 __ovld __cnfn as_uchar8(half4);
-uchar16 __ovld __cnfn as_uchar16(half8);
-short __ovld __cnfn as_short(half);
-short2 __ovld __cnfn as_short2(half2);
-short3 __ovld __cnfn as_short3(half3);
-short3 __ovld __cnfn as_short3(half4);
-short4 __ovld __cnfn as_short4(half3);
-short4 __ovld __cnfn as_short4(half4);
-short8 __ovld __cnfn as_short8(half8);
-short16 __ovld __cnfn as_short16(half16);
-ushort __ovld __cnfn as_ushort(half);
-ushort2 __ovld __cnfn as_ushort2(half2);
-ushort3 __ovld __cnfn as_ushort3(half3);
-ushort3 __ovld __cnfn as_ushort3(half4);
-ushort4 __ovld __cnfn as_ushort4(half3);
-ushort4 __ovld __cnfn as_ushort4(half4);
-ushort8 __ovld __cnfn as_ushort8(half8);
-ushort16 __ovld __cnfn as_ushort16(half16);
-int __ovld __cnfn as_int(half2);
-int2 __ovld __cnfn as_int2(half3);
-int2 __ovld __cnfn as_int2(half4);
-int3 __ovld __cnfn as_int3(half8);
-int4 __ovld __cnfn as_int4(half8);
-int8 __ovld __cnfn as_int8(half16);
-uint __ovld __cnfn as_uint(half2);
-uint2 __ovld __cnfn as_uint2(half3);
-uint2 __ovld __cnfn as_uint2(half4);
-uint3 __ovld __cnfn as_uint3(half8);
-uint4 __ovld __cnfn as_uint4(half8);
-uint8 __ovld __cnfn as_uint8(half16);
-long __ovld __cnfn as_long(half3);
-long __ovld __cnfn as_long(half4);
-long2 __ovld __cnfn as_long2(half8);
-long3 __ovld __cnfn as_long3(half16);
-long4 __ovld __cnfn as_long4(half16);
-ulong __ovld __cnfn as_ulong(half3);
-ulong __ovld __cnfn as_ulong(half4);
-ulong2 __ovld __cnfn as_ulong2(half8);
-ulong3 __ovld __cnfn as_ulong3(half16);
-ulong4 __ovld __cnfn as_ulong4(half16);
-half __ovld __cnfn as_half(char2);
-half __ovld __cnfn as_half(uchar2);
-half __ovld __cnfn as_half(short);
-half __ovld __cnfn as_half(ushort);
-half __ovld __cnfn as_half(half);
-half2 __ovld __cnfn as_half2(char3);
-half2 __ovld __cnfn as_half2(char4);
-half2 __ovld __cnfn as_half2(uchar3);
-half2 __ovld __cnfn as_half2(uchar4);
-half2 __ovld __cnfn as_half2(short2);
-half2 __ovld __cnfn as_half2(ushort2);
-half2 __ovld __cnfn as_half2(int);
-half2 __ovld __cnfn as_half2(uint);
-half2 __ovld __cnfn as_half2(half2);
-half2 __ovld __cnfn as_half2(float);
-half3 __ovld __cnfn as_half3(char8);
-half3 __ovld __cnfn as_half3(uchar8);
-half3 __ovld __cnfn as_half3(short3);
-half3 __ovld __cnfn as_half3(short4);
-half3 __ovld __cnfn as_half3(ushort3);
-half3 __ovld __cnfn as_half3(ushort4);
-half3 __ovld __cnfn as_half3(int2);
-half3 __ovld __cnfn as_half3(uint2);
-half3 __ovld __cnfn as_half3(long);
-half3 __ovld __cnfn as_half3(ulong);
-half3 __ovld __cnfn as_half3(half3);
-half3 __ovld __cnfn as_half3(half4);
-half3 __ovld __cnfn as_half3(float2);
-half4 __ovld __cnfn as_half4(char8);
-half4 __ovld __cnfn as_half4(uchar8);
-half4 __ovld __cnfn as_half4(short3);
-half4 __ovld __cnfn as_half4(short4);
-half4 __ovld __cnfn as_half4(ushort3);
-half4 __ovld __cnfn as_half4(ushort4);
-half4 __ovld __cnfn as_half4(int2);
-half4 __ovld __cnfn as_half4(uint2);
-half4 __ovld __cnfn as_half4(long);
-half4 __ovld __cnfn as_half4(ulong);
-half4 __ovld __cnfn as_half4(half3);
-half4 __ovld __cnfn as_half4(half4);
-half4 __ovld __cnfn as_half4(float2);
-half8 __ovld __cnfn as_half8(char16);
-half8 __ovld __cnfn as_half8(uchar16);
-half8 __ovld __cnfn as_half8(short8);
-half8 __ovld __cnfn as_half8(ushort8);
-half8 __ovld __cnfn as_half8(int3);
-half8 __ovld __cnfn as_half8(int4);
-half8 __ovld __cnfn as_half8(uint3);
-half8 __ovld __cnfn as_half8(uint4);
-half8 __ovld __cnfn as_half8(long2);
-half8 __ovld __cnfn as_half8(ulong2);
-half8 __ovld __cnfn as_half8(half8);
-half8 __ovld __cnfn as_half8(float3);
-half8 __ovld __cnfn as_half8(float4);
-half16 __ovld __cnfn as_half16(short16);
-half16 __ovld __cnfn as_half16(ushort16);
-half16 __ovld __cnfn as_half16(int8);
-half16 __ovld __cnfn as_half16(uint8);
-half16 __ovld __cnfn as_half16(long3);
-half16 __ovld __cnfn as_half16(long4);
-half16 __ovld __cnfn as_half16(ulong3);
-half16 __ovld __cnfn as_half16(ulong4);
-half16 __ovld __cnfn as_half16(half16);
-half16 __ovld __cnfn as_half16(float8);
-float __ovld __cnfn as_float(half2);
-float2 __ovld __cnfn as_float2(half3);
-float2 __ovld __cnfn as_float2(half4);
-float3 __ovld __cnfn as_float3(half8);
-float4 __ovld __cnfn as_float4(half8);
-float8 __ovld __cnfn as_float8(half16);
-
-#ifdef cl_khr_fp64
-half3 __ovld __cnfn as_half3(double);
-half4 __ovld __cnfn as_half4(double);
-half8 __ovld __cnfn as_half8(double2);
-half16 __ovld __cnfn as_half16(double3);
-half16 __ovld __cnfn as_half16(double4);
-double __ovld __cnfn as_double(half3);
-double __ovld __cnfn as_double(half4);
-double2 __ovld __cnfn as_double2(half8);
-double3 __ovld __cnfn as_double3(half16);
-double4 __ovld __cnfn as_double4(half16);
-#endif //cl_khr_fp64
+#define as_half(x) __builtin_astype((x),   half)
+#define as_half2(x) __builtin_astype((x),  half2)
+#define as_half3(x) __builtin_astype((x),  half3)
+#define as_half4(x) __builtin_astype((x),  half4)
+#define as_half8(x) __builtin_astype((x),  half8)
+#define as_half16(x) __builtin_astype((x), half16)
 #endif //cl_khr_fp16
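
These as_typen macros expand to __builtin_astype, which reinterprets the argument's bits in the destination type with no value conversion; source and destination must be the same size. A minimal usage sketch (hypothetical kernel and names):

// Reinterpret a 32-bit pattern as float; no numeric conversion happens.
__kernel void bitcast_demo(__global const uint *in, __global float *out) {
    size_t i = get_global_id(0);
    out[i] = as_float(in[i]);  // same 32 bits, now viewed as IEEE-754 float
}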
 
 // OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
@@ -9810,14 +9125,6 @@
 float4 __ovld __cnfn native_cos(float4 x);
 float8 __ovld __cnfn native_cos(float8 x);
 float16 __ovld __cnfn native_cos(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_cos(double x);
-double2 __ovld __cnfn native_cos(double2 x);
-double3 __ovld __cnfn native_cos(double3 x);
-double4 __ovld __cnfn native_cos(double4 x);
-double8 __ovld __cnfn native_cos(double8 x);
-double16 __ovld __cnfn native_cos(double16 x);
-#endif //cl_khr_fp64
 
 /**
  * Compute x / y over an implementation-defined range.
@@ -9829,14 +9136,6 @@
 float4 __ovld __cnfn native_divide(float4 x, float4 y);
 float8 __ovld __cnfn native_divide(float8 x, float8 y);
 float16 __ovld __cnfn native_divide(float16 x, float16 y);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_divide(double x, double y);
-double2 __ovld __cnfn native_divide(double2 x, double2 y);
-double3 __ovld __cnfn native_divide(double3 x, double3 y);
-double4 __ovld __cnfn native_divide(double4 x, double4 y);
-double8 __ovld __cnfn native_divide(double8 x, double8 y);
-double16 __ovld __cnfn native_divide(double16 x, double16 y);
-#endif //cl_khr_fp64
 
 /**
 * Compute the base-e exponential of x over an
@@ -9849,14 +9148,6 @@
 float4 __ovld __cnfn native_exp(float4 x);
 float8 __ovld __cnfn native_exp(float8 x);
 float16 __ovld __cnfn native_exp(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_exp(double x);
-double2 __ovld __cnfn native_exp(double2 x);
-double3 __ovld __cnfn native_exp(double3 x);
-double4 __ovld __cnfn native_exp(double4 x);
-double8 __ovld __cnfn native_exp(double8 x);
-double16 __ovld __cnfn native_exp(double16 x);
-#endif //cl_khr_fp64
 
 /**
 * Compute the base-2 exponential of x over an
@@ -9869,14 +9160,6 @@
 float4 __ovld __cnfn native_exp2(float4 x);
 float8 __ovld __cnfn native_exp2(float8 x);
 float16 __ovld __cnfn native_exp2(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_exp2(double x);
-double2 __ovld __cnfn native_exp2(double2 x);
-double3 __ovld __cnfn native_exp2(double3 x);
-double4 __ovld __cnfn native_exp2(double4 x);
-double8 __ovld __cnfn native_exp2(double8 x);
-double16 __ovld __cnfn native_exp2(double16 x);
-#endif //cl_khr_fp64
 
 /**
 * Compute the base-10 exponential of x over an
@@ -9889,14 +9172,6 @@
 float4 __ovld __cnfn native_exp10(float4 x);
 float8 __ovld __cnfn native_exp10(float8 x);
 float16 __ovld __cnfn native_exp10(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_exp10(double x);
-double2 __ovld __cnfn native_exp10(double2 x);
-double3 __ovld __cnfn native_exp10(double3 x);
-double4 __ovld __cnfn native_exp10(double4 x);
-double8 __ovld __cnfn native_exp10(double8 x);
-double16 __ovld __cnfn native_exp10(double16 x);
-#endif //cl_khr_fp64
 
 /**
 * Compute natural logarithm over an implementation-defined
@@ -9909,14 +9184,6 @@
 float4 __ovld __cnfn native_log(float4 x);
 float8 __ovld __cnfn native_log(float8 x);
 float16 __ovld __cnfn native_log(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_log(double x);
-double2 __ovld __cnfn native_log(double2 x);
-double3 __ovld __cnfn native_log(double3 x);
-double4 __ovld __cnfn native_log(double4 x);
-double8 __ovld __cnfn native_log(double8 x);
-double16 __ovld __cnfn native_log(double16 x);
-#endif //cl_khr_fp64
 
 /**
 * Compute a base 2 logarithm over an implementation-defined
@@ -9928,14 +9195,6 @@
 float4 __ovld __cnfn native_log2(float4 x);
 float8 __ovld __cnfn native_log2(float8 x);
 float16 __ovld __cnfn native_log2(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_log2(double x);
-double2 __ovld __cnfn native_log2(double2 x);
-double3 __ovld __cnfn native_log2(double3 x);
-double4 __ovld __cnfn native_log2(double4 x);
-double8 __ovld __cnfn native_log2(double8 x);
-double16 __ovld __cnfn native_log2(double16 x);
-#endif //cl_khr_fp64
 
 /**
 * Compute a base 10 logarithm over an implementation-defined
@@ -9947,14 +9206,6 @@
 float4 __ovld __cnfn native_log10(float4 x);
 float8 __ovld __cnfn native_log10(float8 x);
 float16 __ovld __cnfn native_log10(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_log10(double x);
-double2 __ovld __cnfn native_log10(double2 x);
-double3 __ovld __cnfn native_log10(double3 x);
-double4 __ovld __cnfn native_log10(double4 x);
-double8 __ovld __cnfn native_log10(double8 x);
-double16 __ovld __cnfn native_log10(double16 x);
-#endif //cl_khr_fp64
 
 /**
  * Compute x to the power y, where x is >= 0. The range of
@@ -9967,14 +9218,6 @@
 float4 __ovld __cnfn native_powr(float4 x, float4 y);
 float8 __ovld __cnfn native_powr(float8 x, float8 y);
 float16 __ovld __cnfn native_powr(float16 x, float16 y);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_powr(double x, double y);
-double2 __ovld __cnfn native_powr(double2 x, double2 y);
-double3 __ovld __cnfn native_powr(double3 x, double3 y);
-double4 __ovld __cnfn native_powr(double4 x, double4 y);
-double8 __ovld __cnfn native_powr(double8 x, double8 y);
-double16 __ovld __cnfn native_powr(double16 x, double16 y);
-#endif //cl_khr_fp64
 
 /**
  * Compute reciprocal over an implementation-defined
@@ -9986,14 +9229,6 @@
 float4 __ovld __cnfn native_recip(float4 x);
 float8 __ovld __cnfn native_recip(float8 x);
 float16 __ovld __cnfn native_recip(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_recip(double x);
-double2 __ovld __cnfn native_recip(double2 x);
-double3 __ovld __cnfn native_recip(double3 x);
-double4 __ovld __cnfn native_recip(double4 x);
-double8 __ovld __cnfn native_recip(double8 x);
-double16 __ovld __cnfn native_recip(double16 x);
-#endif //cl_khr_fp64
 
 /**
 * Compute inverse square root over an implementation-defined
@@ -10005,14 +9240,6 @@
 float4 __ovld __cnfn native_rsqrt(float4 x);
 float8 __ovld __cnfn native_rsqrt(float8 x);
 float16 __ovld __cnfn native_rsqrt(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_rsqrt(double x);
-double2 __ovld __cnfn native_rsqrt(double2 x);
-double3 __ovld __cnfn native_rsqrt(double3 x);
-double4 __ovld __cnfn native_rsqrt(double4 x);
-double8 __ovld __cnfn native_rsqrt(double8 x);
-double16 __ovld __cnfn native_rsqrt(double16 x);
-#endif //cl_khr_fp64
 
 /**
  * Compute sine over an implementation-defined range.
@@ -10024,14 +9251,6 @@
 float4 __ovld __cnfn native_sin(float4 x);
 float8 __ovld __cnfn native_sin(float8 x);
 float16 __ovld __cnfn native_sin(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_sin(double x);
-double2 __ovld __cnfn native_sin(double2 x);
-double3 __ovld __cnfn native_sin(double3 x);
-double4 __ovld __cnfn native_sin(double4 x);
-double8 __ovld __cnfn native_sin(double8 x);
-double16 __ovld __cnfn native_sin(double16 x);
-#endif //cl_khr_fp64
 
 /**
  * Compute square root over an implementation-defined
@@ -10043,14 +9262,6 @@
 float4 __ovld __cnfn native_sqrt(float4 x);
 float8 __ovld __cnfn native_sqrt(float8 x);
 float16 __ovld __cnfn native_sqrt(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_sqrt(double x);
-double2 __ovld __cnfn native_sqrt(double2 x);
-double3 __ovld __cnfn native_sqrt(double3 x);
-double4 __ovld __cnfn native_sqrt(double4 x);
-double8 __ovld __cnfn native_sqrt(double8 x);
-double16 __ovld __cnfn native_sqrt(double16 x);
-#endif //cl_khr_fp64
 
 /**
  * Compute tangent over an implementation-defined range.
@@ -10062,14 +9273,6 @@
 float4 __ovld __cnfn native_tan(float4 x);
 float8 __ovld __cnfn native_tan(float8 x);
 float16 __ovld __cnfn native_tan(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_tan(double x);
-double2 __ovld __cnfn native_tan(double2 x);
-double3 __ovld __cnfn native_tan(double3 x);
-double4 __ovld __cnfn native_tan(double4 x);
-double8 __ovld __cnfn native_tan(double8 x);
-double16 __ovld __cnfn native_tan(double16 x);
-#endif //cl_khr_fp64
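
The native_* built-ins trade accuracy for speed over an implementation-defined range, and the spec declares them for float gentypes only. A sketch of typical use (hypothetical kernel):

__kernel void fast_normalize(__global float4 *v) {
    size_t i = get_global_id(0);
    float len2 = dot(v[i], v[i]);
    // Reduced-precision inverse square root; accuracy and valid input
    // range are implementation-defined.
    if (len2 > 0.0f)
        v[i] *= native_rsqrt(len2);
}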
 
 // OpenCL v1.1 s6.11.3, v1.2 s6.12.3, v2.0 s6.13.3 - Integer Functions
 
@@ -12178,6 +11381,8 @@
  * For each component of a vector type,
  * result[i] = if MSB of c[i] is set ? b[i] : a[i].
  * For a scalar type, result = c ? b : a.
+ * b and a must have the same type.
+ * c must have the same number of elements and bits as a.
  */
 char __ovld __cnfn select(char a, char b, char c);
 uchar __ovld __cnfn select(uchar a, uchar b, char c);
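
A sketch of the per-component semantics described above (hypothetical kernel; the mask operand must match the value operands in element count and element size):

__kernel void masked_pick(__global const int4 *a, __global const int4 *b,
                          __global const int4 *msb_mask, __global int4 *out) {
    size_t i = get_global_id(0);
    // Component k of the result is b[i].k if the MSB of msb_mask[i].k is
    // set, and a[i].k otherwise.
    out[i] = select(a[i], b[i], msb_mask[i]);
}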
@@ -12191,60 +11396,7 @@
 uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, char8 c);
 char16 __ovld __cnfn select(char16 a, char16 b, char16 c);
 uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, char16 c);
-short __ovld __cnfn select(short a, short b, char c);
-ushort __ovld __cnfn select(ushort a, ushort b, char c);
-short2 __ovld __cnfn select(short2 a, short2 b, char2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, char2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, char3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, char3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, char4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, char4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, char8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, char8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, char16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, char16 c);
-int __ovld __cnfn select(int a, int b, char c);
-uint __ovld __cnfn select(uint a, uint b, char c);
-int2 __ovld __cnfn select(int2 a, int2 b, char2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, char2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, char3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, char3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, char4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, char4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, char8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, char8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, char16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, char16 c);
-long __ovld __cnfn select(long a, long b, char c);
-ulong __ovld __cnfn select(ulong a, ulong b, char c);
-long2 __ovld __cnfn select(long2 a, long2 b, char2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, char2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, char3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, char3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, char4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, char4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, char8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, char8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, char16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, char16 c);
-float __ovld __cnfn select(float a, float b, char c);
-float2 __ovld __cnfn select(float2 a, float2 b, char2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, char3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, char4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, char8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, char16 c);
-char __ovld __cnfn select(char a, char b, short c);
-uchar __ovld __cnfn select(uchar a, uchar b, short c);
-char2 __ovld __cnfn select(char2 a, char2 b, short2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, short2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, short3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, short3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, short4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, short4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, short8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, short8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, short16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, short16 c);
+
 short __ovld __cnfn select(short a, short b, short c);
 ushort __ovld __cnfn select(ushort a, ushort b, short c);
 short2 __ovld __cnfn select(short2 a, short2 b, short2 c);
@@ -12257,60 +11409,7 @@
 ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, short8 c);
 short16 __ovld __cnfn select(short16 a, short16 b, short16 c);
 ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, short16 c);
-int __ovld __cnfn select(int a, int b, short c);
-uint __ovld __cnfn select(uint a, uint b, short c);
-int2 __ovld __cnfn select(int2 a, int2 b, short2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, short2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, short3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, short3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, short4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, short4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, short8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, short8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, short16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, short16 c);
-long __ovld __cnfn select(long a, long b, short c);
-ulong __ovld __cnfn select(ulong a, ulong b, short c);
-long2 __ovld __cnfn select(long2 a, long2 b, short2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, short2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, short3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, short3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, short4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, short4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, short8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, short8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, short16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, short16 c);
-float __ovld __cnfn select(float a, float b, short c);
-float2 __ovld __cnfn select(float2 a, float2 b, short2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, short3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, short4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, short8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, short16 c);
-char __ovld __cnfn select(char a, char b, int c);
-uchar __ovld __cnfn select(uchar a, uchar b, int c);
-char2 __ovld __cnfn select(char2 a, char2 b, int2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, int2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, int3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, int3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, int4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, int4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, int8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, int8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, int16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, int16 c);
-short __ovld __cnfn select(short a, short b, int c);
-ushort __ovld __cnfn select(ushort a, ushort b, int c);
-short2 __ovld __cnfn select(short2 a, short2 b, int2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, int2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, int3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, int3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, int4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, int4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, int8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, int8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, int16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, int16 c);
+
 int __ovld __cnfn select(int a, int b, int c);
 uint __ovld __cnfn select(uint a, uint b, int c);
 int2 __ovld __cnfn select(int2 a, int2 b, int2 c);
@@ -12323,60 +11422,13 @@
 uint8 __ovld __cnfn select(uint8 a, uint8 b, int8 c);
 int16 __ovld __cnfn select(int16 a, int16 b, int16 c);
 uint16 __ovld __cnfn select(uint16 a, uint16 b, int16 c);
-long __ovld __cnfn select(long a, long b, int c);
-ulong __ovld __cnfn select(ulong a, ulong b, int c);
-long2 __ovld __cnfn select(long2 a, long2 b, int2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, int2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, int3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, int3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, int4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, int4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, int8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, int8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, int16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, int16 c);
 float __ovld __cnfn select(float a, float b, int c);
 float2 __ovld __cnfn select(float2 a, float2 b, int2 c);
 float3 __ovld __cnfn select(float3 a, float3 b, int3 c);
 float4 __ovld __cnfn select(float4 a, float4 b, int4 c);
 float8 __ovld __cnfn select(float8 a, float8 b, int8 c);
 float16 __ovld __cnfn select(float16 a, float16 b, int16 c);
-char __ovld __cnfn select(char a, char b, long c);
-uchar __ovld __cnfn select(uchar a, uchar b, long c);
-char2 __ovld __cnfn select(char2 a, char2 b, long2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, long2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, long3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, long3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, long4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, long4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, long8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, long8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, long16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, long16 c);
-short __ovld __cnfn select(short a, short b, long c);
-ushort __ovld __cnfn select(ushort a, ushort b, long c);
-short2 __ovld __cnfn select(short2 a, short2 b, long2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, long2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, long3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, long3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, long4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, long4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, long8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, long8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, long16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, long16 c);
-int __ovld __cnfn select(int a, int b, long c);
-uint __ovld __cnfn select(uint a, uint b, long c);
-int2 __ovld __cnfn select(int2 a, int2 b, long2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, long2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, long3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, long3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, long4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, long4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, long8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, long8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, long16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, long16 c);
+
 long __ovld __cnfn select(long a, long b, long c);
 ulong __ovld __cnfn select(ulong a, ulong b, long c);
 long2 __ovld __cnfn select(long2 a, long2 b, long2 c);
@@ -12389,12 +11441,7 @@
 ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, long8 c);
 long16 __ovld __cnfn select(long16 a, long16 b, long16 c);
 ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, long16 c);
-float __ovld __cnfn select(float a, float b, long c);
-float2 __ovld __cnfn select(float2 a, float2 b, long2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, long3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, long4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, long8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, long16 c);
+
 char __ovld __cnfn select(char a, char b, uchar c);
 uchar __ovld __cnfn select(uchar a, uchar b, uchar c);
 char2 __ovld __cnfn select(char2 a, char2 b, uchar2 c);
@@ -12407,60 +11454,7 @@
 uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uchar8 c);
 char16 __ovld __cnfn select(char16 a, char16 b, uchar16 c);
 uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uchar16 c);
-short __ovld __cnfn select(short a, short b, uchar c);
-ushort __ovld __cnfn select(ushort a, ushort b, uchar c);
-short2 __ovld __cnfn select(short2 a, short2 b, uchar2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uchar2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, uchar3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uchar3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, uchar4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uchar4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, uchar8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uchar8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, uchar16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uchar16 c);
-int __ovld __cnfn select(int a, int b, uchar c);
-uint __ovld __cnfn select(uint a, uint b, uchar c);
-int2 __ovld __cnfn select(int2 a, int2 b, uchar2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, uchar2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, uchar3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, uchar3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, uchar4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, uchar4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, uchar8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, uchar8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, uchar16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, uchar16 c);
-long __ovld __cnfn select(long a, long b, uchar c);
-ulong __ovld __cnfn select(ulong a, ulong b, uchar c);
-long2 __ovld __cnfn select(long2 a, long2 b, uchar2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uchar2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, uchar3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uchar3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, uchar4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uchar4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, uchar8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uchar8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, uchar16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uchar16 c);
-float __ovld __cnfn select(float a, float b, uchar c);
-float2 __ovld __cnfn select(float2 a, float2 b, uchar2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, uchar3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, uchar4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, uchar8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, uchar16 c);
-char __ovld __cnfn select(char a, char b, ushort c);
-uchar __ovld __cnfn select(uchar a, uchar b, ushort c);
-char2 __ovld __cnfn select(char2 a, char2 b, ushort2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ushort2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, ushort3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ushort3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, ushort4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ushort4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, ushort8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ushort8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, ushort16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ushort16 c);
+
 short __ovld __cnfn select(short a, short b, ushort c);
 ushort __ovld __cnfn select(ushort a, ushort b, ushort c);
 short2 __ovld __cnfn select(short2 a, short2 b, ushort2 c);
@@ -12473,60 +11467,7 @@
 ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ushort8 c);
 short16 __ovld __cnfn select(short16 a, short16 b, ushort16 c);
 ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ushort16 c);
-int __ovld __cnfn select(int a, int b, ushort c);
-uint __ovld __cnfn select(uint a, uint b, ushort c);
-int2 __ovld __cnfn select(int2 a, int2 b, ushort2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, ushort2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, ushort3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, ushort3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, ushort4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, ushort4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, ushort8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, ushort8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, ushort16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, ushort16 c);
-long __ovld __cnfn select(long a, long b, ushort c);
-ulong __ovld __cnfn select(ulong a, ulong b, ushort c);
-long2 __ovld __cnfn select(long2 a, long2 b, ushort2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ushort2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, ushort3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ushort3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, ushort4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ushort4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, ushort8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ushort8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, ushort16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ushort16 c);
-float __ovld __cnfn select(float a, float b, ushort c);
-float2 __ovld __cnfn select(float2 a, float2 b, ushort2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, ushort3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, ushort4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, ushort8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, ushort16 c);
-char __ovld __cnfn select(char a, char b, uint c);
-uchar __ovld __cnfn select(uchar a, uchar b, uint c);
-char2 __ovld __cnfn select(char2 a, char2 b, uint2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uint2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, uint3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uint3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, uint4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uint4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, uint8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uint8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, uint16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uint16 c);
-short __ovld __cnfn select(short a, short b, uint c);
-ushort __ovld __cnfn select(ushort a, ushort b, uint c);
-short2 __ovld __cnfn select(short2 a, short2 b, uint2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uint2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, uint3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uint3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, uint4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uint4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, uint8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uint8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, uint16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uint16 c);
+
 int __ovld __cnfn select(int a, int b, uint c);
 uint __ovld __cnfn select(uint a, uint b, uint c);
 int2 __ovld __cnfn select(int2 a, int2 b, uint2 c);
@@ -12539,60 +11480,13 @@
 uint8 __ovld __cnfn select(uint8 a, uint8 b, uint8 c);
 int16 __ovld __cnfn select(int16 a, int16 b, uint16 c);
 uint16 __ovld __cnfn select(uint16 a, uint16 b, uint16 c);
-long __ovld __cnfn select(long a, long b, uint c);
-ulong __ovld __cnfn select(ulong a, ulong b, uint c);
-long2 __ovld __cnfn select(long2 a, long2 b, uint2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uint2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, uint3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uint3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, uint4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uint4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, uint8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uint8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, uint16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uint16 c);
 float __ovld __cnfn select(float a, float b, uint c);
 float2 __ovld __cnfn select(float2 a, float2 b, uint2 c);
 float3 __ovld __cnfn select(float3 a, float3 b, uint3 c);
 float4 __ovld __cnfn select(float4 a, float4 b, uint4 c);
 float8 __ovld __cnfn select(float8 a, float8 b, uint8 c);
 float16 __ovld __cnfn select(float16 a, float16 b, uint16 c);
-char __ovld __cnfn select(char a, char b, ulong c);
-uchar __ovld __cnfn select(uchar a, uchar b, ulong c);
-char2 __ovld __cnfn select(char2 a, char2 b, ulong2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ulong2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, ulong3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ulong3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, ulong4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ulong4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, ulong8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ulong8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, ulong16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ulong16 c);
-short __ovld __cnfn select(short a, short b, ulong c);
-ushort __ovld __cnfn select(ushort a, ushort b, ulong c);
-short2 __ovld __cnfn select(short2 a, short2 b, ulong2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ulong2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, ulong3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ulong3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, ulong4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ulong4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, ulong8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ulong8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, ulong16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ulong16 c);
-int __ovld __cnfn select(int a, int b, ulong c);
-uint __ovld __cnfn select(uint a, uint b, ulong c);
-int2 __ovld __cnfn select(int2 a, int2 b, ulong2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, ulong2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, ulong3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, ulong3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, ulong4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, ulong4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, ulong8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, ulong8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, ulong16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, ulong16 c);
+
 long __ovld __cnfn select(long a, long b, ulong c);
 ulong __ovld __cnfn select(ulong a, ulong b, ulong c);
 long2 __ovld __cnfn select(long2 a, long2 b, ulong2 c);
@@ -12605,12 +11499,7 @@
 ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c);
 long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c);
 ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c);
-float __ovld __cnfn select(float a, float b, ulong c);
-float2 __ovld __cnfn select(float2 a, float2 b, ulong2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, ulong3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, ulong4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, ulong8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, ulong16 c);
+
 #ifdef cl_khr_fp64
 double __ovld __cnfn select(double a, double b, long c);
 double2 __ovld __cnfn select(double2 a, double2 b, long2 c);
@@ -12651,7 +11540,7 @@
  *
 * vstoren writes sizeof (gentypen) bytes given by data to address (p + (offset * n)).
  *
- * The address computed as (p + (offset * n)) must be 
+ * The address computed as (p + (offset * n)) must be
  * 8-bit aligned if gentype is char, uchar;
  * 16-bit aligned if gentype is short, ushort, half;
  * 32-bit aligned if gentype is int, uint, float;
@@ -13934,21 +12823,22 @@
  * image objects and then want to read the updated data.
  */
 
-void __ovld barrier(cl_mem_fence_flags flags);
+void __ovld __conv barrier(cl_mem_fence_flags flags);
 
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
-typedef enum memory_scope
-{
-  memory_scope_work_item,
-  memory_scope_work_group,
-  memory_scope_device,
-  memory_scope_all_svm_devices,
-  memory_scope_sub_group
+typedef enum memory_scope {
+  memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+  memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+  memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif
 } memory_scope;
 
-void __ovld work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
-void __ovld work_group_barrier(cl_mem_fence_flags flags);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
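
A sketch of the 2.0-style barrier with an explicit scope (hypothetical kernel that reverses a tile staged in local memory):

__kernel void reverse_tile(__global const float *in, __global float *out,
                           __local float *tile) {
    size_t gid = get_global_id(0);
    size_t lid = get_local_id(0);
    tile[lid] = in[gid];
    // All work-items in the group must reach the barrier before any of
    // them may read what the others wrote to local memory.
    work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
    out[gid] = tile[get_local_size(0) - 1 - lid];
}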
 
 // OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
@@ -13972,7 +12862,7 @@
  * Read memory barrier that orders only
  * loads.
  * The flags argument specifies the memory
- * address space and can be set to to a
+ * address space and can be set to a
  * combination of the following literal
  * values:
  * CLK_LOCAL_MEM_FENCE
@@ -13984,7 +12874,7 @@
  * Write memory barrier that orders only
  * stores.
  * The flags argument specifies the memory
- * address space and can be set to to a
+ * address space and can be set to a
  * combination of the following literal
  * values:
  * CLK_LOCAL_MEM_FENCE
@@ -13998,7 +12888,7 @@
 cl_mem_fence_flags __ovld get_fence(const void *ptr);
 cl_mem_fence_flags __ovld get_fence(void *ptr);
 
-/** 
+/**
  * Builtin functions to_global, to_local, and to_private need to be declared as Clang builtin functions
  * and checked in Sema since they should be declared as
  *   addr gentype* to_addr (gentype*);
@@ -14500,10 +13390,10 @@
 
 #if defined(cl_khr_global_int32_base_atomics)
 int __ovld atom_xchg(volatile __global int *p, int val);
-int __ovld atom_xchg(volatile __local int *p, int val);
+unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
 #endif
 #if defined(cl_khr_local_int32_base_atomics)
-unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
+int __ovld atom_xchg(volatile __local int *p, int val);
 unsigned int __ovld atom_xchg(volatile __local unsigned int *p, unsigned int val);
 #endif
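
These legacy atom_* builtins are gated on the pragma for their extension; a minimal sketch using the global 32-bit exchange (hypothetical kernel):

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
__kernel void try_lock(volatile __global int *lock, __global int *prev) {
    // Atomically store 1 and return the previous value; a result of 0
    // means this work-item acquired the lock.
    prev[get_global_id(0)] = atom_xchg(lock, 1);
}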
 
@@ -14620,8 +13510,6 @@
 #if defined(cl_khr_int64_extended_atomics)
 long __ovld atom_min(volatile __global long *p, long val);
 unsigned long __ovld atom_min(volatile __global unsigned long *p, unsigned long val);
-#endif
-#if defined(cl_khr_local_int32_extended_atomics)
 long __ovld atom_min(volatile __local long *p, long val);
 unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long val);
 #endif
@@ -14728,6 +13616,13 @@
 unsigned int __ovld atom_xor(volatile __local unsigned int *p, unsigned int val);
 #endif
 
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_xor(volatile __global long *p, long val);
+unsigned long __ovld atom_xor(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_xor(volatile __local long *p, long val);
+unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long val);
+#endif
+
 #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
@@ -14744,11 +13639,11 @@
 // enum values aligned with what clang uses in EmitAtomicExpr()
 typedef enum memory_order
 {
-  memory_order_relaxed,
-  memory_order_acquire,
-  memory_order_release,
-  memory_order_acq_rel,
-  memory_order_seq_cst
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
 } memory_order;
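
Because the enumerators now carry clang's __ATOMIC_* values, the explicit atomic builtins map directly onto the compiler's atomic lowering. A sketch (hypothetical OpenCL 2.0 kernel):

__kernel void histogram_bump(__global atomic_uint *bins,
                             __global const uint *keys) {
    uint k = keys[get_global_id(0)];
    // A relaxed, device-scope increment suffices for a plain counter.
    atomic_fetch_add_explicit(&bins[k], 1u,
                              memory_order_relaxed,
                              memory_scope_device);
}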
 
 // double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
@@ -14878,7 +13773,7 @@
 // add/sub: atomic type argument can be uintptr_t/intptr_t, value type argument can be ptrdiff_t.
 // or/xor/and/min/max: atomic type argument can be intptr_t/uintptr_t, value type argument can be intptr_t/uintptr_t.
 
-#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) 
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
 uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *object, ptrdiff_t operand);
 uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
 uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
@@ -15564,9 +14459,11 @@
 half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask);
 #endif //cl_khr_fp16
 
+#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
 // OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
 
 int printf(__constant const char* st, ...);
+#endif
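
printf is only declared for OpenCL C 1.2 and later, and it accepts a restricted set of format specifiers. A sketch (hypothetical kernel):

__kernel void debug_ids(void) {
    // OpenCL printf has no %zu; cast size_t IDs to a fixed-width type.
    printf("gid=%u\n", (uint)get_global_id(0));
}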
 
 // OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
 
@@ -15592,6 +14489,10 @@
 #define CLK_FILTER_NEAREST              0x10
 #define CLK_FILTER_LINEAR               0x20
 
+#ifdef cl_khr_gl_msaa_sharing
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
+#endif //cl_khr_gl_msaa_sharing
+
 /**
  * Use the coordinate (coord.xy) to do an element lookup in
  * the 2D image object specified by image.
@@ -15670,7 +14571,7 @@
  * only. The filter_mode specified in sampler
  * must be set to CLK_FILTER_NEAREST; otherwise
  * the values returned are undefined.
- 
+
  * The read_image{f|i|ui} calls that take
  * integer coordinates must use a sampler with
  * normalized coordinates set to
@@ -15748,6 +14649,7 @@
 #endif //cl_khr_gl_msaa_sharing
 
 // OpenCL Extension v2.0 s9.18 - Mipmaps
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 
 float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
@@ -15823,6 +14725,7 @@
 uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
 
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
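
The mipmap read overloads take an explicit level of detail as the trailing argument; a sketch assuming the extension is available (hypothetical kernel):

#pragma OPENCL EXTENSION cl_khr_mipmap_image : enable
__kernel void sample_lod(read_only image2d_t img, sampler_t smp,
                         __global float4 *out) {
    // Read from mip level 2 at the image center.
    out[0] = read_imagef(img, smp, (float2)(0.5f, 0.5f), 2.0f);
}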
 
 /**
 * Sampler-less Image Access
@@ -15921,6 +14824,7 @@
 float __purefn __ovld read_imagef(read_write image2d_array_msaa_depth_t image, int4 coord, int sample);
 #endif //cl_khr_gl_msaa_sharing
 
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
 int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
@@ -15994,6 +14898,7 @@
 int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
 uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 // Image read functions returning half4 type
 #ifdef cl_khr_fp16
@@ -16093,9 +14998,11 @@
 void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int4 color);
 void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, uint4 color);
 
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imagef(write_only image3d_t image, int4 coord, float4 color);
 void __ovld write_imagei(write_only image3d_t image, int4 coord, int4 color);
 void __ovld write_imageui(write_only image3d_t image, int4 coord, uint4 color);
+#endif
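
Writing to image3d_t now requires cl_khr_3d_image_writes; a sketch assuming the device exposes it (hypothetical kernel):

#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
__kernel void clear_voxel(write_only image3d_t img) {
    int4 c = (int4)((int)get_global_id(0),
                    (int)get_global_id(1),
                    (int)get_global_id(2), 0);
    write_imagef(img, c, (float4)(0.0f, 0.0f, 0.0f, 0.0f));
}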
 
 #ifdef cl_khr_depth_images
 void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, float color);
@@ -16103,6 +15010,7 @@
 #endif //cl_khr_depth_images
 
 // OpenCL Extension v2.0 s9.18 - Mipmaps
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color);
 void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color);
@@ -16123,16 +15031,21 @@
 void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color);
 void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color);
 
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color);
 void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color);
 void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color);
+#endif
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 // Image write functions for half4 type
 #ifdef cl_khr_fp16
 void __ovld write_imageh(write_only image1d_t image, int coord, half4 color);
 void __ovld write_imageh(write_only image2d_t image, int2 coord, half4 color);
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imageh(write_only image3d_t image, int4 coord, half4 color);
+#endif
 void __ovld write_imageh(write_only image1d_array_t image, int2 coord, half4 color);
 void __ovld write_imageh(write_only image2d_array_t image, int4 coord, half4 color);
 void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 color);
@@ -16160,15 +15073,18 @@
 void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int4 color);
 void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, uint4 color);
 
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imagef(read_write image3d_t image, int4 coord, float4 color);
 void __ovld write_imagei(read_write image3d_t image, int4 coord, int4 color);
 void __ovld write_imageui(read_write image3d_t image, int4 coord, uint4 color);
+#endif
 
 #ifdef cl_khr_depth_images
 void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, float color);
 void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, float color);
 #endif //cl_khr_depth_images
 
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color);
 void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color);
@@ -16189,16 +15105,21 @@
 void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, int lod, float color);
 void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int lod, float color);
 
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color);
 void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color);
 void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color);
+#endif
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 // Image write functions for half4 type
 #ifdef cl_khr_fp16
 void __ovld write_imageh(read_write image1d_t image, int coord, half4 color);
 void __ovld write_imageh(read_write image2d_t image, int2 coord, half4 color);
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imageh(read_write image3d_t image, int4 coord, half4 color);
+#endif
 void __ovld write_imageh(read_write image1d_array_t image, int2 coord, half4 color);
 void __ovld write_imageh(read_write image2d_array_t image, int4 coord, half4 color);
 void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 color);
@@ -16216,7 +15137,9 @@
 int __ovld __cnfn get_image_width(read_only image1d_t image);
 int __ovld __cnfn get_image_width(read_only image1d_buffer_t image);
 int __ovld __cnfn get_image_width(read_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_width(read_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_width(read_only image1d_array_t image);
 int __ovld __cnfn get_image_width(read_only image2d_array_t image);
 #ifdef cl_khr_depth_images
@@ -16233,7 +15156,9 @@
 int __ovld __cnfn get_image_width(write_only image1d_t image);
 int __ovld __cnfn get_image_width(write_only image1d_buffer_t image);
 int __ovld __cnfn get_image_width(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_width(write_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_width(write_only image1d_array_t image);
 int __ovld __cnfn get_image_width(write_only image2d_array_t image);
 #ifdef cl_khr_depth_images
@@ -16284,7 +15209,9 @@
 #endif //cl_khr_gl_msaa_sharing
 
 int __ovld __cnfn get_image_height(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_height(write_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_height(write_only image2d_array_t image);
 #ifdef cl_khr_depth_images
 int __ovld __cnfn get_image_height(write_only image2d_depth_t image);
@@ -16318,13 +15245,16 @@
  */
 int __ovld __cnfn get_image_depth(read_only image3d_t image);
 
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_depth(write_only image3d_t image);
+#endif
 
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 int __ovld __cnfn get_image_depth(read_write image3d_t image);
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 // OpenCL Extension v2.0 s9.18 - Mipmaps
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 /**
  * Return the image miplevels.
@@ -16336,13 +15266,13 @@
 
 int __ovld get_image_num_mip_levels(write_only image1d_t image);
 int __ovld get_image_num_mip_levels(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld get_image_num_mip_levels(write_only image3d_t image);
+#endif
 
-#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 int __ovld get_image_num_mip_levels(read_write image1d_t image);
 int __ovld get_image_num_mip_levels(read_write image2d_t image);
 int __ovld get_image_num_mip_levels(read_write image3d_t image);
-#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 int __ovld get_image_num_mip_levels(read_only image1d_array_t image);
 int __ovld get_image_num_mip_levels(read_only image2d_array_t image);
@@ -16354,14 +15284,13 @@
 int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t image);
 int __ovld get_image_num_mip_levels(write_only image2d_depth_t image);
 
-#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 int __ovld get_image_num_mip_levels(read_write image1d_array_t image);
 int __ovld get_image_num_mip_levels(read_write image2d_array_t image);
 int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t image);
 int __ovld get_image_num_mip_levels(read_write image2d_depth_t image);
-#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 /**
  * Return the channel data type. Valid values are:
@@ -16422,7 +15351,9 @@
 int __ovld __cnfn get_image_channel_data_type(write_only image1d_t image);
 int __ovld __cnfn get_image_channel_data_type(write_only image1d_buffer_t image);
 int __ovld __cnfn get_image_channel_data_type(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_channel_data_type(write_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_channel_data_type(write_only image1d_array_t image);
 int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_t image);
 #ifdef cl_khr_depth_images
@@ -16490,9 +15421,10 @@
 #define CLK_DEPTH_STENCIL     0x10BE
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #define CLK_sRGB              0x10BF
-#define CLK_sRGBA             0x10C1
 #define CLK_sRGBx             0x10C0
+#define CLK_sRGBA             0x10C1
 #define CLK_sBGRA             0x10C2
+#define CLK_ABGR              0x10C3
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 int __ovld __cnfn get_image_channel_order(read_only image1d_t image);
@@ -16515,7 +15447,9 @@
 int __ovld __cnfn get_image_channel_order(write_only image1d_t image);
 int __ovld __cnfn get_image_channel_order(write_only image1d_buffer_t image);
 int __ovld __cnfn get_image_channel_order(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_channel_order(write_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_channel_order(write_only image1d_array_t image);
 int __ovld __cnfn get_image_channel_order(write_only image2d_array_t image);
 #ifdef cl_khr_depth_images
@@ -16601,7 +15535,9 @@
  * component and the w component is 0.
  */
 int4 __ovld __cnfn get_image_dim(read_only image3d_t image);
+#ifdef cl_khr_3d_image_writes
 int4 __ovld __cnfn get_image_dim(write_only image3d_t image);
+#endif
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 int4 __ovld __cnfn get_image_dim(read_write image3d_t image);
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
@@ -16670,101 +15606,101 @@
 // OpenCL v2.0 s6.13.15 - Work-group Functions
 
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
-int __ovld work_group_all(int predicate);
-int __ovld work_group_any(int predicate);
+int __ovld __conv work_group_all(int predicate);
+int __ovld __conv work_group_any(int predicate);
 
 #ifdef cl_khr_fp16
-half __ovld work_group_broadcast(half a, size_t local_id);
-half __ovld work_group_broadcast(half a, size_t x, size_t y);
-half __ovld work_group_broadcast(half a, size_t x, size_t y, size_t z);
+half __ovld __conv work_group_broadcast(half a, size_t local_id);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y, size_t z);
 #endif
-int __ovld work_group_broadcast(int a, size_t local_id);
-int __ovld work_group_broadcast(int a, size_t x, size_t y);
-int __ovld work_group_broadcast(int a, size_t x, size_t y, size_t z);
-uint __ovld work_group_broadcast(uint a, size_t local_id);
-uint __ovld work_group_broadcast(uint a, size_t x, size_t y);
-uint __ovld work_group_broadcast(uint a, size_t x, size_t y, size_t z);
-long __ovld work_group_broadcast(long a, size_t local_id);
-long __ovld work_group_broadcast(long a, size_t x, size_t y);
-long __ovld work_group_broadcast(long a, size_t x, size_t y, size_t z);
-ulong __ovld work_group_broadcast(ulong a, size_t local_id);
-ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y);
-ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
-float __ovld work_group_broadcast(float a, size_t local_id);
-float __ovld work_group_broadcast(float a, size_t x, size_t y);
-float __ovld work_group_broadcast(float a, size_t x, size_t y, size_t z);
+int __ovld __conv work_group_broadcast(int a, size_t local_id);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y, size_t z);
+uint __ovld __conv work_group_broadcast(uint a, size_t local_id);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y, size_t z);
+long __ovld __conv work_group_broadcast(long a, size_t local_id);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y, size_t z);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t local_id);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
+float __ovld __conv work_group_broadcast(float a, size_t local_id);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y, size_t z);
 #ifdef cl_khr_fp64
-double __ovld work_group_broadcast(double a, size_t local_id);
-double __ovld work_group_broadcast(double a, size_t x, size_t y);
-double __ovld work_group_broadcast(double a, size_t x, size_t y, size_t z);
+double __ovld __conv work_group_broadcast(double a, size_t local_id);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y, size_t z);
 #endif //cl_khr_fp64
 
 #ifdef cl_khr_fp16
-half __ovld work_group_reduce_add(half x);
-half __ovld work_group_reduce_min(half x);
-half __ovld work_group_reduce_max(half x);
-half __ovld work_group_scan_exclusive_add(half x);
-half __ovld work_group_scan_exclusive_min(half x);
-half __ovld work_group_scan_exclusive_max(half x);
-half __ovld work_group_scan_inclusive_add(half x);
-half __ovld work_group_scan_inclusive_min(half x);
-half __ovld work_group_scan_inclusive_max(half x);
+half __ovld __conv work_group_reduce_add(half x);
+half __ovld __conv work_group_reduce_min(half x);
+half __ovld __conv work_group_reduce_max(half x);
+half __ovld __conv work_group_scan_exclusive_add(half x);
+half __ovld __conv work_group_scan_exclusive_min(half x);
+half __ovld __conv work_group_scan_exclusive_max(half x);
+half __ovld __conv work_group_scan_inclusive_add(half x);
+half __ovld __conv work_group_scan_inclusive_min(half x);
+half __ovld __conv work_group_scan_inclusive_max(half x);
 #endif
-int __ovld work_group_reduce_add(int x);
-int __ovld work_group_reduce_min(int x);
-int __ovld work_group_reduce_max(int x);
-int __ovld work_group_scan_exclusive_add(int x);
-int __ovld work_group_scan_exclusive_min(int x);
-int __ovld work_group_scan_exclusive_max(int x);
-int __ovld work_group_scan_inclusive_add(int x);
-int __ovld work_group_scan_inclusive_min(int x);
-int __ovld work_group_scan_inclusive_max(int x);
-uint __ovld work_group_reduce_add(uint x);
-uint __ovld work_group_reduce_min(uint x);
-uint __ovld work_group_reduce_max(uint x);
-uint __ovld work_group_scan_exclusive_add(uint x);
-uint __ovld work_group_scan_exclusive_min(uint x);
-uint __ovld work_group_scan_exclusive_max(uint x);
-uint __ovld work_group_scan_inclusive_add(uint x);
-uint __ovld work_group_scan_inclusive_min(uint x);
-uint __ovld work_group_scan_inclusive_max(uint x);
-long __ovld work_group_reduce_add(long x);
-long __ovld work_group_reduce_min(long x);
-long __ovld work_group_reduce_max(long x);
-long __ovld work_group_scan_exclusive_add(long x);
-long __ovld work_group_scan_exclusive_min(long x);
-long __ovld work_group_scan_exclusive_max(long x);
-long __ovld work_group_scan_inclusive_add(long x);
-long __ovld work_group_scan_inclusive_min(long x);
-long __ovld work_group_scan_inclusive_max(long x);
-ulong __ovld work_group_reduce_add(ulong x);
-ulong __ovld work_group_reduce_min(ulong x);
-ulong __ovld work_group_reduce_max(ulong x);
-ulong __ovld work_group_scan_exclusive_add(ulong x);
-ulong __ovld work_group_scan_exclusive_min(ulong x);
-ulong __ovld work_group_scan_exclusive_max(ulong x);
-ulong __ovld work_group_scan_inclusive_add(ulong x);
-ulong __ovld work_group_scan_inclusive_min(ulong x);
-ulong __ovld work_group_scan_inclusive_max(ulong x);
-float __ovld work_group_reduce_add(float x);
-float __ovld work_group_reduce_min(float x);
-float __ovld work_group_reduce_max(float x);
-float __ovld work_group_scan_exclusive_add(float x);
-float __ovld work_group_scan_exclusive_min(float x);
-float __ovld work_group_scan_exclusive_max(float x);
-float __ovld work_group_scan_inclusive_add(float x);
-float __ovld work_group_scan_inclusive_min(float x);
-float __ovld work_group_scan_inclusive_max(float x);
+int __ovld __conv work_group_reduce_add(int x);
+int __ovld __conv work_group_reduce_min(int x);
+int __ovld __conv work_group_reduce_max(int x);
+int __ovld __conv work_group_scan_exclusive_add(int x);
+int __ovld __conv work_group_scan_exclusive_min(int x);
+int __ovld __conv work_group_scan_exclusive_max(int x);
+int __ovld __conv work_group_scan_inclusive_add(int x);
+int __ovld __conv work_group_scan_inclusive_min(int x);
+int __ovld __conv work_group_scan_inclusive_max(int x);
+uint __ovld __conv work_group_reduce_add(uint x);
+uint __ovld __conv work_group_reduce_min(uint x);
+uint __ovld __conv work_group_reduce_max(uint x);
+uint __ovld __conv work_group_scan_exclusive_add(uint x);
+uint __ovld __conv work_group_scan_exclusive_min(uint x);
+uint __ovld __conv work_group_scan_exclusive_max(uint x);
+uint __ovld __conv work_group_scan_inclusive_add(uint x);
+uint __ovld __conv work_group_scan_inclusive_min(uint x);
+uint __ovld __conv work_group_scan_inclusive_max(uint x);
+long __ovld __conv work_group_reduce_add(long x);
+long __ovld __conv work_group_reduce_min(long x);
+long __ovld __conv work_group_reduce_max(long x);
+long __ovld __conv work_group_scan_exclusive_add(long x);
+long __ovld __conv work_group_scan_exclusive_min(long x);
+long __ovld __conv work_group_scan_exclusive_max(long x);
+long __ovld __conv work_group_scan_inclusive_add(long x);
+long __ovld __conv work_group_scan_inclusive_min(long x);
+long __ovld __conv work_group_scan_inclusive_max(long x);
+ulong __ovld __conv work_group_reduce_add(ulong x);
+ulong __ovld __conv work_group_reduce_min(ulong x);
+ulong __ovld __conv work_group_reduce_max(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_max(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_max(ulong x);
+float __ovld __conv work_group_reduce_add(float x);
+float __ovld __conv work_group_reduce_min(float x);
+float __ovld __conv work_group_reduce_max(float x);
+float __ovld __conv work_group_scan_exclusive_add(float x);
+float __ovld __conv work_group_scan_exclusive_min(float x);
+float __ovld __conv work_group_scan_exclusive_max(float x);
+float __ovld __conv work_group_scan_inclusive_add(float x);
+float __ovld __conv work_group_scan_inclusive_min(float x);
+float __ovld __conv work_group_scan_inclusive_max(float x);
 #ifdef cl_khr_fp64
-double __ovld work_group_reduce_add(double x);
-double __ovld work_group_reduce_min(double x);
-double __ovld work_group_reduce_max(double x);
-double __ovld work_group_scan_exclusive_add(double x);
-double __ovld work_group_scan_exclusive_min(double x);
-double __ovld work_group_scan_exclusive_max(double x);
-double __ovld work_group_scan_inclusive_add(double x);
-double __ovld work_group_scan_inclusive_min(double x);
-double __ovld work_group_scan_inclusive_max(double x);
+double __ovld __conv work_group_reduce_add(double x);
+double __ovld __conv work_group_reduce_min(double x);
+double __ovld __conv work_group_reduce_max(double x);
+double __ovld __conv work_group_scan_exclusive_add(double x);
+double __ovld __conv work_group_scan_exclusive_min(double x);
+double __ovld __conv work_group_scan_exclusive_max(double x);
+double __ovld __conv work_group_scan_inclusive_add(double x);
+double __ovld __conv work_group_scan_inclusive_min(double x);
+double __ovld __conv work_group_scan_inclusive_max(double x);
 #endif //cl_khr_fp64
 
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
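
A minimal sketch of the work-group reductions declared above, assuming compilation as OpenCL 2.0 (-cl-std=CL2.0); the kernel name is illustrative:

kernel void partial_sums(global const float *in, global float *out) {
  float total = work_group_reduce_add(in[get_global_id(0)]);
  if (get_local_id(0) == 0)
    out[get_group_id(0)] = total;   // reduction result is uniform across the work-group
}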
@@ -16811,16 +15747,12 @@
 
 #define MAX_WORK_DIM        3
 
-// ToDo: Remove definition of ndrange_t in Clang as an opaque type and add back
-// the following ndrange_t definition.
-#if 0
 typedef struct {
     unsigned int workDimension;
     size_t globalWorkOffset[MAX_WORK_DIM];
     size_t globalWorkSize[MAX_WORK_DIM];
     size_t localWorkSize[MAX_WORK_DIM];
 } ndrange_t;
-#endif
 
 ndrange_t __ovld ndrange_1D(size_t);
 ndrange_t __ovld ndrange_1D(size_t, size_t);
@@ -16840,11 +15772,11 @@
 
 void __ovld release_event(clk_event_t);
 
-clk_event_t create_user_event(void);
+clk_event_t __ovld create_user_event(void);
 
 void __ovld set_user_event_status(clk_event_t e, int state);
 
-bool is_valid_event (clk_event_t event);
+bool __ovld is_valid_event(clk_event_t event);
 
 void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value);
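
As a hypothetical sketch of device-side enqueue (OpenCL 2.0 only), tying the restored ndrange_t to the queue and event machinery above:

kernel void parent(global int *buf) {
  queue_t q = get_default_queue();
  enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange_1D(64),
                 ^{ buf[get_global_id(0)] += 1; });   // child kernel as a block
}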
 
@@ -16864,96 +15796,593 @@
 uint    __ovld get_sub_group_id(void);
 uint    __ovld get_sub_group_local_id(void);
 
-void    __ovld sub_group_barrier(cl_mem_fence_flags flags);
+void    __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
-void    __ovld sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void    __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
-int     __ovld sub_group_all(int predicate);
-int     __ovld sub_group_any(int predicate);
+int     __ovld __conv sub_group_all(int predicate);
+int     __ovld __conv sub_group_any(int predicate);
 
-int     __ovld sub_group_broadcast(int   x, uint sub_group_local_id);
-uint    __ovld sub_group_broadcast(uint  x, uint sub_group_local_id);
-long    __ovld sub_group_broadcast(long  x, uint sub_group_local_id);
-ulong   __ovld sub_group_broadcast(ulong x, uint sub_group_local_id);
-float   __ovld sub_group_broadcast(float x, uint sub_group_local_id);
+int     __ovld __conv sub_group_broadcast(int   x, uint sub_group_local_id);
+uint    __ovld __conv sub_group_broadcast(uint  x, uint sub_group_local_id);
+long    __ovld __conv sub_group_broadcast(long  x, uint sub_group_local_id);
+ulong   __ovld __conv sub_group_broadcast(ulong x, uint sub_group_local_id);
+float   __ovld __conv sub_group_broadcast(float x, uint sub_group_local_id);
 
-int     __ovld sub_group_reduce_add(int   x);
-uint    __ovld sub_group_reduce_add(uint  x);
-long    __ovld sub_group_reduce_add(long  x);
-ulong   __ovld sub_group_reduce_add(ulong x);
-float   __ovld sub_group_reduce_add(float x);
-int     __ovld sub_group_reduce_min(int   x);
-uint    __ovld sub_group_reduce_min(uint  x);
-long    __ovld sub_group_reduce_min(long  x);
-ulong   __ovld sub_group_reduce_min(ulong x);
-float   __ovld sub_group_reduce_min(float x);
-int     __ovld sub_group_reduce_max(int   x);
-uint    __ovld sub_group_reduce_max(uint  x);
-long    __ovld sub_group_reduce_max(long  x);
-ulong   __ovld sub_group_reduce_max(ulong x);
-float   __ovld sub_group_reduce_max(float x);
+int     __ovld __conv sub_group_reduce_add(int   x);
+uint    __ovld __conv sub_group_reduce_add(uint  x);
+long    __ovld __conv sub_group_reduce_add(long  x);
+ulong   __ovld __conv sub_group_reduce_add(ulong x);
+float   __ovld __conv sub_group_reduce_add(float x);
+int     __ovld __conv sub_group_reduce_min(int   x);
+uint    __ovld __conv sub_group_reduce_min(uint  x);
+long    __ovld __conv sub_group_reduce_min(long  x);
+ulong   __ovld __conv sub_group_reduce_min(ulong x);
+float   __ovld __conv sub_group_reduce_min(float x);
+int     __ovld __conv sub_group_reduce_max(int   x);
+uint    __ovld __conv sub_group_reduce_max(uint  x);
+long    __ovld __conv sub_group_reduce_max(long  x);
+ulong   __ovld __conv sub_group_reduce_max(ulong x);
+float   __ovld __conv sub_group_reduce_max(float x);
 
-int     __ovld sub_group_scan_exclusive_add(int   x);
-uint    __ovld sub_group_scan_exclusive_add(uint  x);
-long    __ovld sub_group_scan_exclusive_add(long  x);
-ulong   __ovld sub_group_scan_exclusive_add(ulong x);
-float   __ovld sub_group_scan_exclusive_add(float x);
-int     __ovld sub_group_scan_exclusive_min(int   x);
-uint    __ovld sub_group_scan_exclusive_min(uint  x);
-long    __ovld sub_group_scan_exclusive_min(long  x);
-ulong   __ovld sub_group_scan_exclusive_min(ulong x);
-float   __ovld sub_group_scan_exclusive_min(float x);
-int     __ovld sub_group_scan_exclusive_max(int   x);
-uint    __ovld sub_group_scan_exclusive_max(uint  x);
-long    __ovld sub_group_scan_exclusive_max(long  x);
-ulong   __ovld sub_group_scan_exclusive_max(ulong x);
-float   __ovld sub_group_scan_exclusive_max(float x);
+int     __ovld __conv sub_group_scan_exclusive_add(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_add(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_add(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_add(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_add(float x);
+int     __ovld __conv sub_group_scan_exclusive_min(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_min(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_min(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_min(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_min(float x);
+int     __ovld __conv sub_group_scan_exclusive_max(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_max(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_max(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_max(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_max(float x);
 
-int     __ovld sub_group_scan_inclusive_add(int   x);
-uint    __ovld sub_group_scan_inclusive_add(uint  x);
-long    __ovld sub_group_scan_inclusive_add(long  x);
-ulong   __ovld sub_group_scan_inclusive_add(ulong x);
-float   __ovld sub_group_scan_inclusive_add(float x);
-int     __ovld sub_group_scan_inclusive_min(int   x);
-uint    __ovld sub_group_scan_inclusive_min(uint  x);
-long    __ovld sub_group_scan_inclusive_min(long  x);
-ulong   __ovld sub_group_scan_inclusive_min(ulong x);
-float   __ovld sub_group_scan_inclusive_min(float x);
-int     __ovld sub_group_scan_inclusive_max(int   x);
-uint    __ovld sub_group_scan_inclusive_max(uint  x);
-long    __ovld sub_group_scan_inclusive_max(long  x);
-ulong   __ovld sub_group_scan_inclusive_max(ulong x);
-float   __ovld sub_group_scan_inclusive_max(float x);
+int     __ovld __conv sub_group_scan_inclusive_add(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_add(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_add(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_add(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_add(float x);
+int     __ovld __conv sub_group_scan_inclusive_min(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_min(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_min(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_min(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_min(float x);
+int     __ovld __conv sub_group_scan_inclusive_max(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_max(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_max(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_max(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_max(float x);
 
 #ifdef cl_khr_fp16
-half    __ovld sub_group_broadcast(half x, uint sub_group_local_id);
-half    __ovld sub_group_reduce_add(half x);
-half    __ovld sub_group_reduce_min(half x);
-half    __ovld sub_group_reduce_max(half x);
-half    __ovld sub_group_scan_exclusive_add(half x);
-half    __ovld sub_group_scan_exclusive_min(half x);
-half    __ovld sub_group_scan_exclusive_max(half x);
-half    __ovld sub_group_scan_inclusive_add(half x);
-half    __ovld sub_group_scan_inclusive_min(half x);
-half    __ovld sub_group_scan_inclusive_max(half x);
+half    __ovld __conv sub_group_broadcast(half x, uint sub_group_local_id);
+half    __ovld __conv sub_group_reduce_add(half x);
+half    __ovld __conv sub_group_reduce_min(half x);
+half    __ovld __conv sub_group_reduce_max(half x);
+half    __ovld __conv sub_group_scan_exclusive_add(half x);
+half    __ovld __conv sub_group_scan_exclusive_min(half x);
+half    __ovld __conv sub_group_scan_exclusive_max(half x);
+half    __ovld __conv sub_group_scan_inclusive_add(half x);
+half    __ovld __conv sub_group_scan_inclusive_min(half x);
+half    __ovld __conv sub_group_scan_inclusive_max(half x);
 #endif //cl_khr_fp16
 
 #ifdef cl_khr_fp64
-double  __ovld sub_group_broadcast(double x, uint sub_group_local_id);
-double  __ovld sub_group_reduce_add(double x);
-double  __ovld sub_group_reduce_min(double x);
-double  __ovld sub_group_reduce_max(double x);
-double  __ovld sub_group_scan_exclusive_add(double x);
-double  __ovld sub_group_scan_exclusive_min(double x);
-double  __ovld sub_group_scan_exclusive_max(double x);
-double  __ovld sub_group_scan_inclusive_add(double x);
-double  __ovld sub_group_scan_inclusive_min(double x);
-double  __ovld sub_group_scan_inclusive_max(double x);
+double  __ovld __conv sub_group_broadcast(double x, uint sub_group_local_id);
+double  __ovld __conv sub_group_reduce_add(double x);
+double  __ovld __conv sub_group_reduce_min(double x);
+double  __ovld __conv sub_group_reduce_max(double x);
+double  __ovld __conv sub_group_scan_exclusive_add(double x);
+double  __ovld __conv sub_group_scan_exclusive_min(double x);
+double  __ovld __conv sub_group_scan_exclusive_max(double x);
+double  __ovld __conv sub_group_scan_inclusive_add(double x);
+double  __ovld __conv sub_group_scan_inclusive_min(double x);
+double  __ovld __conv sub_group_scan_inclusive_max(double x);
 #endif //cl_khr_fp64
 
 #endif //cl_khr_subgroups cl_intel_subgroups
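
A minimal sketch of the sub-group scans above; it assumes cl_khr_subgroups has been enabled with the usual #pragma, and the kernel name is illustrative:

kernel void prefix(global const int *in, global int *out) {
  size_t gid = get_global_id(0);
  out[gid] = sub_group_scan_exclusive_add(in[gid]);   // prefix sum within each sub-group
}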
 
+#if defined(cl_intel_subgroups)
+// Intel-Specific Sub Group Functions
+float   __ovld __conv intel_sub_group_shuffle( float  x, uint c );
+float2  __ovld __conv intel_sub_group_shuffle( float2 x, uint c );
+float3  __ovld __conv intel_sub_group_shuffle( float3 x, uint c );
+float4  __ovld __conv intel_sub_group_shuffle( float4 x, uint c );
+float8  __ovld __conv intel_sub_group_shuffle( float8 x, uint c );
+float16 __ovld __conv intel_sub_group_shuffle( float16 x, uint c );
+
+int     __ovld __conv intel_sub_group_shuffle( int  x, uint c );
+int2    __ovld __conv intel_sub_group_shuffle( int2 x, uint c );
+int3    __ovld __conv intel_sub_group_shuffle( int3 x, uint c );
+int4    __ovld __conv intel_sub_group_shuffle( int4 x, uint c );
+int8    __ovld __conv intel_sub_group_shuffle( int8 x, uint c );
+int16   __ovld __conv intel_sub_group_shuffle( int16 x, uint c );
+
+uint    __ovld __conv intel_sub_group_shuffle( uint  x, uint c );
+uint2   __ovld __conv intel_sub_group_shuffle( uint2 x, uint c );
+uint3   __ovld __conv intel_sub_group_shuffle( uint3 x, uint c );
+uint4   __ovld __conv intel_sub_group_shuffle( uint4 x, uint c );
+uint8   __ovld __conv intel_sub_group_shuffle( uint8 x, uint c );
+uint16  __ovld __conv intel_sub_group_shuffle( uint16 x, uint c );
+
+long    __ovld __conv intel_sub_group_shuffle( long x, uint c );
+ulong   __ovld __conv intel_sub_group_shuffle( ulong x, uint c );
+
+float   __ovld __conv intel_sub_group_shuffle_down( float  cur, float  next, uint c );
+float2  __ovld __conv intel_sub_group_shuffle_down( float2 cur, float2 next, uint c );
+float3  __ovld __conv intel_sub_group_shuffle_down( float3 cur, float3 next, uint c );
+float4  __ovld __conv intel_sub_group_shuffle_down( float4 cur, float4 next, uint c );
+float8  __ovld __conv intel_sub_group_shuffle_down( float8 cur, float8 next, uint c );
+float16 __ovld __conv intel_sub_group_shuffle_down( float16 cur, float16 next, uint c );
+
+int     __ovld __conv intel_sub_group_shuffle_down( int  cur, int  next, uint c );
+int2    __ovld __conv intel_sub_group_shuffle_down( int2 cur, int2 next, uint c );
+int3    __ovld __conv intel_sub_group_shuffle_down( int3 cur, int3 next, uint c );
+int4    __ovld __conv intel_sub_group_shuffle_down( int4 cur, int4 next, uint c );
+int8    __ovld __conv intel_sub_group_shuffle_down( int8 cur, int8 next, uint c );
+int16   __ovld __conv intel_sub_group_shuffle_down( int16 cur, int16 next, uint c );
+
+uint    __ovld __conv intel_sub_group_shuffle_down( uint  cur, uint  next, uint c );
+uint2   __ovld __conv intel_sub_group_shuffle_down( uint2 cur, uint2 next, uint c );
+uint3   __ovld __conv intel_sub_group_shuffle_down( uint3 cur, uint3 next, uint c );
+uint4   __ovld __conv intel_sub_group_shuffle_down( uint4 cur, uint4 next, uint c );
+uint8   __ovld __conv intel_sub_group_shuffle_down( uint8 cur, uint8 next, uint c );
+uint16  __ovld __conv intel_sub_group_shuffle_down( uint16 cur, uint16 next, uint c );
+
+long    __ovld __conv intel_sub_group_shuffle_down( long cur, long next, uint c );
+ulong   __ovld __conv intel_sub_group_shuffle_down( ulong cur, ulong next, uint c );
+
+float   __ovld __conv intel_sub_group_shuffle_up( float  prev, float  cur, uint c );
+float2  __ovld __conv intel_sub_group_shuffle_up( float2 prev, float2 cur, uint c );
+float3  __ovld __conv intel_sub_group_shuffle_up( float3 prev, float3 cur, uint c );
+float4  __ovld __conv intel_sub_group_shuffle_up( float4 prev, float4 cur, uint c );
+float8  __ovld __conv intel_sub_group_shuffle_up( float8 prev, float8 cur, uint c );
+float16 __ovld __conv intel_sub_group_shuffle_up( float16 prev, float16 cur, uint c );
+
+int     __ovld __conv intel_sub_group_shuffle_up( int  prev, int  cur, uint c );
+int2    __ovld __conv intel_sub_group_shuffle_up( int2 prev, int2 cur, uint c );
+int3    __ovld __conv intel_sub_group_shuffle_up( int3 prev, int3 cur, uint c );
+int4    __ovld __conv intel_sub_group_shuffle_up( int4 prev, int4 cur, uint c );
+int8    __ovld __conv intel_sub_group_shuffle_up( int8 prev, int8 cur, uint c );
+int16   __ovld __conv intel_sub_group_shuffle_up( int16 prev, int16 cur, uint c );
+
+uint    __ovld __conv intel_sub_group_shuffle_up( uint  prev, uint  cur, uint c );
+uint2   __ovld __conv intel_sub_group_shuffle_up( uint2 prev, uint2 cur, uint c );
+uint3   __ovld __conv intel_sub_group_shuffle_up( uint3 prev, uint3 cur, uint c );
+uint4   __ovld __conv intel_sub_group_shuffle_up( uint4 prev, uint4 cur, uint c );
+uint8   __ovld __conv intel_sub_group_shuffle_up( uint8 prev, uint8 cur, uint c );
+uint16  __ovld __conv intel_sub_group_shuffle_up( uint16 prev, uint16 cur, uint c );
+
+long    __ovld __conv intel_sub_group_shuffle_up( long prev, long cur, uint c );
+ulong   __ovld __conv intel_sub_group_shuffle_up( ulong prev, ulong cur, uint c );
+
+float   __ovld __conv intel_sub_group_shuffle_xor( float  x, uint c );
+float2  __ovld __conv intel_sub_group_shuffle_xor( float2 x, uint c );
+float3  __ovld __conv intel_sub_group_shuffle_xor( float3 x, uint c );
+float4  __ovld __conv intel_sub_group_shuffle_xor( float4 x, uint c );
+float8  __ovld __conv intel_sub_group_shuffle_xor( float8 x, uint c );
+float16 __ovld __conv intel_sub_group_shuffle_xor( float16 x, uint c );
+
+int     __ovld __conv intel_sub_group_shuffle_xor( int  x, uint c );
+int2    __ovld __conv intel_sub_group_shuffle_xor( int2 x, uint c );
+int3    __ovld __conv intel_sub_group_shuffle_xor( int3 x, uint c );
+int4    __ovld __conv intel_sub_group_shuffle_xor( int4 x, uint c );
+int8    __ovld __conv intel_sub_group_shuffle_xor( int8 x, uint c );
+int16   __ovld __conv intel_sub_group_shuffle_xor( int16 x, uint c );
+
+uint    __ovld __conv intel_sub_group_shuffle_xor( uint  x, uint c );
+uint2   __ovld __conv intel_sub_group_shuffle_xor( uint2 x, uint c );
+uint3   __ovld __conv intel_sub_group_shuffle_xor( uint3 x, uint c );
+uint4   __ovld __conv intel_sub_group_shuffle_xor( uint4 x, uint c );
+uint8   __ovld __conv intel_sub_group_shuffle_xor( uint8 x, uint c );
+uint16  __ovld __conv intel_sub_group_shuffle_xor( uint16 x, uint c );
+
+long    __ovld __conv intel_sub_group_shuffle_xor( long x, uint c );
+ulong   __ovld __conv intel_sub_group_shuffle_xor( ulong x, uint c );
+
+uint    __ovld __conv intel_sub_group_block_read( read_only image2d_t image, int2 coord );
+uint2   __ovld __conv intel_sub_group_block_read2( read_only image2d_t image, int2 coord );
+uint4   __ovld __conv intel_sub_group_block_read4( read_only image2d_t image, int2 coord );
+uint8   __ovld __conv intel_sub_group_block_read8( read_only image2d_t image, int2 coord );
+
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+uint    __ovld __conv intel_sub_group_block_read(read_write image2d_t image, int2 coord);
+uint2   __ovld __conv intel_sub_group_block_read2(read_write image2d_t image, int2 coord);
+uint4   __ovld __conv intel_sub_group_block_read4(read_write image2d_t image, int2 coord);
+uint8   __ovld __conv intel_sub_group_block_read8(read_write image2d_t image, int2 coord);
+#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+
+uint    __ovld __conv intel_sub_group_block_read( const __global uint* p );
+uint2   __ovld __conv intel_sub_group_block_read2( const __global uint* p );
+uint4   __ovld __conv intel_sub_group_block_read4( const __global uint* p );
+uint8   __ovld __conv intel_sub_group_block_read8( const __global uint* p );
+
+void    __ovld __conv intel_sub_group_block_write(write_only image2d_t image, int2 coord, uint data);
+void    __ovld __conv intel_sub_group_block_write2(write_only image2d_t image, int2 coord, uint2 data);
+void    __ovld __conv intel_sub_group_block_write4(write_only image2d_t image, int2 coord, uint4 data);
+void    __ovld __conv intel_sub_group_block_write8(write_only image2d_t image, int2 coord, uint8 data);
+
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+void    __ovld __conv intel_sub_group_block_write(read_write image2d_t image, int2 coord, uint data);
+void    __ovld __conv intel_sub_group_block_write2(read_write image2d_t image, int2 coord, uint2 data);
+void    __ovld __conv intel_sub_group_block_write4(read_write image2d_t image, int2 coord, uint4 data);
+void    __ovld __conv intel_sub_group_block_write8(read_write image2d_t image, int2 coord, uint8 data);
+#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+
+void    __ovld __conv intel_sub_group_block_write( __global uint* p, uint data );
+void    __ovld __conv intel_sub_group_block_write2( __global uint* p, uint2 data );
+void    __ovld __conv intel_sub_group_block_write4( __global uint* p, uint4 data );
+void    __ovld __conv intel_sub_group_block_write8( __global uint* p, uint8 data );
+
+#ifdef cl_khr_fp16
+half    __ovld __conv intel_sub_group_shuffle( half x, uint c );
+half    __ovld __conv intel_sub_group_shuffle_down( half prev, half cur, uint c );
+half    __ovld __conv intel_sub_group_shuffle_up( half prev, half cur, uint c );
+half    __ovld __conv intel_sub_group_shuffle_xor( half x, uint c );
+#endif
+
+#if defined(cl_khr_fp64)
+double  __ovld __conv intel_sub_group_shuffle( double x, uint c );
+double  __ovld __conv intel_sub_group_shuffle_down( double prev, double cur, uint c );
+double  __ovld __conv intel_sub_group_shuffle_up( double prev, double cur, uint c );
+double  __ovld __conv intel_sub_group_shuffle_xor( double x, uint c );
+#endif
+
+#endif //cl_intel_subgroups
+
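For illustration, a hypothetical kernel that rotates values one lane to the left within a sub-group using the Intel shuffle declared above:

kernel void rotate_lanes(global float *data) {
  size_t gid = get_global_id(0);
  uint src = (get_sub_group_local_id() + 1) % get_max_sub_group_size();
  data[gid] = intel_sub_group_shuffle(data[gid], src);
}
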
+#if defined(cl_intel_subgroups_short)
+short       __ovld __conv intel_sub_group_broadcast( short  x, uint sub_group_local_id );
+short2      __ovld __conv intel_sub_group_broadcast( short2 x, uint sub_group_local_id );
+short3      __ovld __conv intel_sub_group_broadcast( short3 x, uint sub_group_local_id );
+short4      __ovld __conv intel_sub_group_broadcast( short4 x, uint sub_group_local_id );
+short8      __ovld __conv intel_sub_group_broadcast( short8 x, uint sub_group_local_id );
+
+ushort      __ovld __conv intel_sub_group_broadcast( ushort  x, uint sub_group_local_id );
+ushort2     __ovld __conv intel_sub_group_broadcast( ushort2 x, uint sub_group_local_id );
+ushort3     __ovld __conv intel_sub_group_broadcast( ushort3 x, uint sub_group_local_id );
+ushort4     __ovld __conv intel_sub_group_broadcast( ushort4 x, uint sub_group_local_id );
+ushort8     __ovld __conv intel_sub_group_broadcast( ushort8 x, uint sub_group_local_id );
+
+short       __ovld __conv intel_sub_group_shuffle( short   x, uint c );
+short2      __ovld __conv intel_sub_group_shuffle( short2  x, uint c );
+short3      __ovld __conv intel_sub_group_shuffle( short3  x, uint c );
+short4      __ovld __conv intel_sub_group_shuffle( short4  x, uint c );
+short8      __ovld __conv intel_sub_group_shuffle( short8  x, uint c );
+short16     __ovld __conv intel_sub_group_shuffle( short16 x, uint c );
+
+ushort      __ovld __conv intel_sub_group_shuffle( ushort   x, uint c );
+ushort2     __ovld __conv intel_sub_group_shuffle( ushort2  x, uint c );
+ushort3     __ovld __conv intel_sub_group_shuffle( ushort3  x, uint c );
+ushort4     __ovld __conv intel_sub_group_shuffle( ushort4  x, uint c );
+ushort8     __ovld __conv intel_sub_group_shuffle( ushort8  x, uint c );
+ushort16    __ovld __conv intel_sub_group_shuffle( ushort16 x, uint c );
+
+short       __ovld __conv intel_sub_group_shuffle_down( short   cur, short   next, uint c );
+short2      __ovld __conv intel_sub_group_shuffle_down( short2  cur, short2  next, uint c );
+short3      __ovld __conv intel_sub_group_shuffle_down( short3  cur, short3  next, uint c );
+short4      __ovld __conv intel_sub_group_shuffle_down( short4  cur, short4  next, uint c );
+short8      __ovld __conv intel_sub_group_shuffle_down( short8  cur, short8  next, uint c );
+short16     __ovld __conv intel_sub_group_shuffle_down( short16 cur, short16 next, uint c );
+
+ushort      __ovld __conv intel_sub_group_shuffle_down( ushort   cur, ushort   next, uint c );
+ushort2     __ovld __conv intel_sub_group_shuffle_down( ushort2  cur, ushort2  next, uint c );
+ushort3     __ovld __conv intel_sub_group_shuffle_down( ushort3  cur, ushort3  next, uint c );
+ushort4     __ovld __conv intel_sub_group_shuffle_down( ushort4  cur, ushort4  next, uint c );
+ushort8     __ovld __conv intel_sub_group_shuffle_down( ushort8  cur, ushort8  next, uint c );
+ushort16    __ovld __conv intel_sub_group_shuffle_down( ushort16 cur, ushort16 next, uint c );
+
+short       __ovld __conv intel_sub_group_shuffle_up( short   prev, short   cur, uint c );
+short2      __ovld __conv intel_sub_group_shuffle_up( short2  prev, short2  cur, uint c );
+short3      __ovld __conv intel_sub_group_shuffle_up( short3  prev, short3  cur, uint c );
+short4      __ovld __conv intel_sub_group_shuffle_up( short4  prev, short4  cur, uint c );
+short8      __ovld __conv intel_sub_group_shuffle_up( short8  prev, short8  cur, uint c );
+short16     __ovld __conv intel_sub_group_shuffle_up( short16 prev, short16 cur, uint c );
+
+ushort      __ovld __conv intel_sub_group_shuffle_up( ushort   prev, ushort   cur, uint c );
+ushort2     __ovld __conv intel_sub_group_shuffle_up( ushort2  prev, ushort2  cur, uint c );
+ushort3     __ovld __conv intel_sub_group_shuffle_up( ushort3  prev, ushort3  cur, uint c );
+ushort4     __ovld __conv intel_sub_group_shuffle_up( ushort4  prev, ushort4  cur, uint c );
+ushort8     __ovld __conv intel_sub_group_shuffle_up( ushort8  prev, ushort8  cur, uint c );
+ushort16    __ovld __conv intel_sub_group_shuffle_up( ushort16 prev, ushort16 cur, uint c );
+
+short       __ovld __conv intel_sub_group_shuffle_xor( short   x, uint c );
+short2      __ovld __conv intel_sub_group_shuffle_xor( short2  x, uint c );
+short3      __ovld __conv intel_sub_group_shuffle_xor( short3  x, uint c );
+short4      __ovld __conv intel_sub_group_shuffle_xor( short4  x, uint c );
+short8      __ovld __conv intel_sub_group_shuffle_xor( short8  x, uint c );
+short16     __ovld __conv intel_sub_group_shuffle_xor( short16 x, uint c );
+
+ushort      __ovld __conv intel_sub_group_shuffle_xor( ushort   x, uint c );
+ushort2     __ovld __conv intel_sub_group_shuffle_xor( ushort2  x, uint c );
+ushort3     __ovld __conv intel_sub_group_shuffle_xor( ushort3  x, uint c );
+ushort4     __ovld __conv intel_sub_group_shuffle_xor( ushort4  x, uint c );
+ushort8     __ovld __conv intel_sub_group_shuffle_xor( ushort8  x, uint c );
+ushort16    __ovld __conv intel_sub_group_shuffle_xor( ushort16 x, uint c );
+
+short       __ovld __conv intel_sub_group_reduce_add( short   x );
+ushort      __ovld __conv intel_sub_group_reduce_add( ushort  x );
+short       __ovld __conv intel_sub_group_reduce_min( short   x );
+ushort      __ovld __conv intel_sub_group_reduce_min( ushort  x );
+short       __ovld __conv intel_sub_group_reduce_max( short   x );
+ushort      __ovld __conv intel_sub_group_reduce_max( ushort  x );
+
+short       __ovld __conv intel_sub_group_scan_exclusive_add( short   x );
+ushort      __ovld __conv intel_sub_group_scan_exclusive_add( ushort  x );
+short       __ovld __conv intel_sub_group_scan_exclusive_min( short   x );
+ushort      __ovld __conv intel_sub_group_scan_exclusive_min( ushort  x );
+short       __ovld __conv intel_sub_group_scan_exclusive_max( short   x );
+ushort      __ovld __conv intel_sub_group_scan_exclusive_max( ushort  x );
+
+short       __ovld __conv intel_sub_group_scan_inclusive_add( short   x );
+ushort      __ovld __conv intel_sub_group_scan_inclusive_add( ushort  x );
+short       __ovld __conv intel_sub_group_scan_inclusive_min( short   x );
+ushort      __ovld __conv intel_sub_group_scan_inclusive_min( ushort  x );
+short       __ovld __conv intel_sub_group_scan_inclusive_max( short   x );
+ushort      __ovld __conv intel_sub_group_scan_inclusive_max( ushort  x );
+
+uint       __ovld __conv intel_sub_group_block_read_ui( read_only image2d_t image, int2 byte_coord );
+uint2      __ovld __conv intel_sub_group_block_read_ui2( read_only image2d_t image, int2 byte_coord );
+uint4      __ovld __conv intel_sub_group_block_read_ui4( read_only image2d_t image, int2 byte_coord );
+uint8      __ovld __conv intel_sub_group_block_read_ui8( read_only image2d_t image, int2 byte_coord );
+
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+uint       __ovld __conv intel_sub_group_block_read_ui( read_write image2d_t image, int2 byte_coord );
+uint2      __ovld __conv intel_sub_group_block_read_ui2( read_write image2d_t image, int2 byte_coord );
+uint4      __ovld __conv intel_sub_group_block_read_ui4( read_write image2d_t image, int2 byte_coord );
+uint8      __ovld __conv intel_sub_group_block_read_ui8( read_write image2d_t image, int2 byte_coord );
+#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+
+uint       __ovld __conv intel_sub_group_block_read_ui( const __global uint* p );
+uint2      __ovld __conv intel_sub_group_block_read_ui2( const __global uint* p );
+uint4      __ovld __conv intel_sub_group_block_read_ui4( const __global uint* p );
+uint8      __ovld __conv intel_sub_group_block_read_ui8( const __global uint* p );
+
+void       __ovld __conv intel_sub_group_block_write_ui( write_only image2d_t image, int2 byte_coord, uint data );
+void       __ovld __conv intel_sub_group_block_write_ui2( write_only image2d_t image, int2 byte_coord, uint2 data );
+void       __ovld __conv intel_sub_group_block_write_ui4( write_only image2d_t image, int2 byte_coord, uint4 data );
+void       __ovld __conv intel_sub_group_block_write_ui8( write_only image2d_t image, int2 byte_coord, uint8 data );
+
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+void       __ovld __conv intel_sub_group_block_write_ui( read_write image2d_t image, int2 byte_coord, uint data );
+void       __ovld __conv intel_sub_group_block_write_ui2( read_write image2d_t image, int2 byte_coord, uint2 data );
+void       __ovld __conv intel_sub_group_block_write_ui4( read_write image2d_t image, int2 byte_coord, uint4 data );
+void       __ovld __conv intel_sub_group_block_write_ui8( read_write image2d_t image, int2 byte_coord, uint8 data );
+#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+
+void       __ovld __conv intel_sub_group_block_write_ui( __global uint* p, uint data );
+void       __ovld __conv intel_sub_group_block_write_ui2( __global uint* p, uint2 data );
+void       __ovld __conv intel_sub_group_block_write_ui4( __global uint* p, uint4 data );
+void       __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint8 data );
+
+ushort      __ovld __conv intel_sub_group_block_read_us( read_only image2d_t image, int2 coord );
+ushort2     __ovld __conv intel_sub_group_block_read_us2( read_only image2d_t image, int2 coord );
+ushort4     __ovld __conv intel_sub_group_block_read_us4( read_only image2d_t image, int2 coord );
+ushort8     __ovld __conv intel_sub_group_block_read_us8( read_only image2d_t image, int2 coord );
+
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+ushort      __ovld __conv intel_sub_group_block_read_us(read_write image2d_t image, int2 coord);
+ushort2     __ovld __conv intel_sub_group_block_read_us2(read_write image2d_t image, int2 coord);
+ushort4     __ovld __conv intel_sub_group_block_read_us4(read_write image2d_t image, int2 coord);
+ushort8     __ovld __conv intel_sub_group_block_read_us8(read_write image2d_t image, int2 coord);
+#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+
+ushort      __ovld __conv intel_sub_group_block_read_us(  const __global ushort* p );
+ushort2     __ovld __conv intel_sub_group_block_read_us2( const __global ushort* p );
+ushort4     __ovld __conv intel_sub_group_block_read_us4( const __global ushort* p );
+ushort8     __ovld __conv intel_sub_group_block_read_us8( const __global ushort* p );
+
+void        __ovld __conv intel_sub_group_block_write_us(write_only image2d_t image, int2 coord, ushort  data);
+void        __ovld __conv intel_sub_group_block_write_us2(write_only image2d_t image, int2 coord, ushort2 data);
+void        __ovld __conv intel_sub_group_block_write_us4(write_only image2d_t image, int2 coord, ushort4 data);
+void        __ovld __conv intel_sub_group_block_write_us8(write_only image2d_t image, int2 coord, ushort8 data);
+
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+void        __ovld __conv intel_sub_group_block_write_us(read_write image2d_t image, int2 coord, ushort  data);
+void        __ovld __conv intel_sub_group_block_write_us2(read_write image2d_t image, int2 coord, ushort2 data);
+void        __ovld __conv intel_sub_group_block_write_us4(read_write image2d_t image, int2 coord, ushort4 data);
+void        __ovld __conv intel_sub_group_block_write_us8(read_write image2d_t image, int2 coord, ushort8 data);
+#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+
+void        __ovld __conv intel_sub_group_block_write_us(  __global ushort* p, ushort  data );
+void        __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, ushort2 data );
+void        __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, ushort4 data );
+void        __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data );
+#endif // cl_intel_subgroups_short
+
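A sketch of the block I/O declared above; src and dst are assumed to be the uniform, suitably aligned base of a sub-group-sized block, and the kernel name is hypothetical:

kernel void copy_block(const global ushort *src, global ushort *dst) {
  ushort v = intel_sub_group_block_read_us(src);   // one element per work-item
  intel_sub_group_block_write_us(dst, v);
}
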
+#ifdef cl_amd_media_ops
+uint __ovld amd_bitalign(uint a, uint b, uint c);
+uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bitalign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bitalign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bitalign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bitalign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_bytealign(uint a, uint b, uint c);
+uint2 __ovld amd_bytealign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bytealign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bytealign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bytealign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bytealign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_lerp(uint a, uint b, uint c);
+uint2 __ovld amd_lerp(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_lerp(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_lerp(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_lerp(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_lerp(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_pack(float4 v);
+
+uint __ovld amd_sad4(uint4 x, uint4 y, uint z);
+
+uint __ovld amd_sadhi(uint a, uint b, uint c);
+uint2 __ovld amd_sadhi(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sadhi(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sadhi(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sadhi(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sadhi(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_sad(uint a, uint b, uint c);
+uint2 __ovld amd_sad(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sad(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sad(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sad(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sad(uint16 a, uint16 b, uint16 c);
+
+float __ovld amd_unpack0(uint a);
+float2 __ovld amd_unpack0(uint2 a);
+float3 __ovld amd_unpack0(uint3 a);
+float4 __ovld amd_unpack0(uint4 a);
+float8 __ovld amd_unpack0(uint8 a);
+float16 __ovld amd_unpack0(uint16 a);
+
+float __ovld amd_unpack1(uint a);
+float2 __ovld amd_unpack1(uint2 a);
+float3 __ovld amd_unpack1(uint3 a);
+float4 __ovld amd_unpack1(uint4 a);
+float8 __ovld amd_unpack1(uint8 a);
+float16 __ovld amd_unpack1(uint16 a);
+
+float __ovld amd_unpack2(uint a);
+float2 __ovld amd_unpack2(uint2 a);
+float3 __ovld amd_unpack2(uint3 a);
+float4 __ovld amd_unpack2(uint4 a);
+float8 __ovld amd_unpack2(uint8 a);
+float16 __ovld amd_unpack2(uint16 a);
+
+float __ovld amd_unpack3(uint a);
+float2 __ovld amd_unpack3(uint2 a);
+float3 __ovld amd_unpack3(uint3 a);
+float4 __ovld amd_unpack3(uint4 a);
+float8 __ovld amd_unpack3(uint8 a);
+float16 __ovld amd_unpack3(uint16 a);
+#endif // cl_amd_media_ops
+
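As an illustrative sketch of cl_amd_media_ops, amd_bytealign roughly selects four bytes from the concatenation of its first two operands; the kernel below is hypothetical:

kernel void shift_bytes(global const uint *in, global uint *out) {
  size_t i = get_global_id(0);
  out[i] = amd_bytealign(in[i + 1], in[i], 2);   // roughly: (in[i+1]:in[i]) >> 16
}
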
+#ifdef cl_amd_media_ops2
+int __ovld amd_bfe(int src0, uint src1, uint src2);
+int2 __ovld amd_bfe(int2 src0, uint2 src1, uint2 src2);
+int3 __ovld amd_bfe(int3 src0, uint3 src1, uint3 src2);
+int4 __ovld amd_bfe(int4 src0, uint4 src1, uint4 src2);
+int8 __ovld amd_bfe(int8 src0, uint8 src1, uint8 src2);
+int16 __ovld amd_bfe(int16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfe(uint src0, uint src1, uint src2);
+uint2 __ovld amd_bfe(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_bfe(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_bfe(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_bfe(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_bfe(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfm(uint src0, uint src1);
+uint2 __ovld amd_bfm(uint2 src0, uint2 src1);
+uint3 __ovld amd_bfm(uint3 src0, uint3 src1);
+uint4 __ovld amd_bfm(uint4 src0, uint4 src1);
+uint8 __ovld amd_bfm(uint8 src0, uint8 src1);
+uint16 __ovld amd_bfm(uint16 src0, uint16 src1);
+
+float __ovld amd_max3(float src0, float src1, float src2);
+float2 __ovld amd_max3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_max3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_max3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_max3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_max3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_max3(int src0, int src1, int src2);
+int2 __ovld amd_max3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_max3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_max3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_max3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_max3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_max3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_max3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_max3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_max3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_max3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_max3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_median3(float src0, float src1, float src2);
+float2 __ovld amd_median3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_median3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_median3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_median3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_median3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_median3(int src0, int src1, int src2);
+int2 __ovld amd_median3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_median3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_median3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_median3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_median3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_median3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_median3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_median3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_median3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_median3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_median3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_min3(float src0, float src1, float src2);
+float2 __ovld amd_min3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_min3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_min3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_min3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_min3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_min3(int src0, int src1, int src2);
+int2 __ovld amd_min3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_min3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_min3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_min3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_min3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_min3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_min3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_min3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_min3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_min3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_min3(uint16 src0, uint16 src1, uint16 src2);
+
+ulong __ovld amd_mqsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_mqsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_mqsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_mqsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_mqsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_mqsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+ulong __ovld amd_qsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_qsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_qsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_qsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_qsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_qsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+uint __ovld amd_msad(uint src0, uint src1, uint src2);
+uint2 __ovld amd_msad(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_msad(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_msad(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_msad(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_msad(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadd(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadd(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadd(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadd(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadd(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadd(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadw(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadw(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadw(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadw(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2);
+#endif // cl_amd_media_ops2
+
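One handy identity from cl_amd_media_ops2, sketched with a hypothetical kernel: the median of (x, lo, hi) clamps x to [lo, hi]:

kernel void clamp_values(global float *x, float lo, float hi) {
  size_t i = get_global_id(0);
  x[i] = amd_median3(x[i], lo, hi);   // median of three == clamp
}
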
 // Disable any extensions we may have enabled previously.
 #pragma OPENCL EXTENSION all : disable
 
diff --git a/darwin-x86/clang-headers/pconfigintrin.h b/darwin-x86/clang-headers/pconfigintrin.h
new file mode 100644
index 0000000..fee3cad
--- /dev/null
+++ b/darwin-x86/clang-headers/pconfigintrin.h
@@ -0,0 +1,50 @@
+/*===---- pconfigintrin.h - X86 platform configuration ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __PCONFIGINTRIN_H
+#define __PCONFIGINTRIN_H
+
+#define __PCONFIG_KEY_PROGRAM 0x00000001
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("pconfig")))
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+  unsigned int __result;
+  __asm__ ("pconfig"
+           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+           : "cc");
+  return __result;
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
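
A hedged usage sketch for the new intrinsic (the helper name is hypothetical): PCONFIG executes only at ring 0, so this would fault in user mode, and it assumes compilation with -mpconfig:

#include <x86intrin.h>

unsigned int program_key(void) {
  __SIZE_TYPE__ regs[3] = {0, 0, 0};   // EBX/ECX/EDX operands for the leaf
  return _pconfig_u32(__PCONFIG_KEY_PROGRAM, regs);
}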
diff --git a/darwin-x86/clang-headers/pkuintrin.h b/darwin-x86/clang-headers/pkuintrin.h
index 9e54594..6976924 100644
--- a/darwin-x86/clang-headers/pkuintrin.h
+++ b/darwin-x86/clang-headers/pkuintrin.h
@@ -1,4 +1,4 @@
-/*===------------- pkuintrin.h - PKU intrinsics ------------------===
+/*===---- pkuintrin.h - PKU intrinsics -------------------------------------===
  *
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -40,7 +40,7 @@
 static __inline__ void __DEFAULT_FN_ATTRS
 _wrpkru(unsigned int __val)
 {
-  return __builtin_ia32_wrpkru(__val);
+  __builtin_ia32_wrpkru(__val);
 }
 
 #undef __DEFAULT_FN_ATTRS
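
A brief sketch of the fixed intrinsic in use (helper name hypothetical; assumes -mpku and a protection key obtained from the OS):

#include <x86intrin.h>

void deny_access_key1(void) {
  unsigned int pkru = _rdpkru_u32();
  _wrpkru(pkru | (3u << 2));   // set the AD and WD bits for protection key 1
}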
diff --git a/darwin-x86/clang-headers/pmmintrin.h b/darwin-x86/clang-headers/pmmintrin.h
index 5b10580..7e1a9ea 100644
--- a/darwin-x86/clang-headers/pmmintrin.h
+++ b/darwin-x86/clang-headers/pmmintrin.h
@@ -28,16 +28,18 @@
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+  __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
 
-/// \brief Loads data from an unaligned memory location to elements in a 128-bit
-///    vector. If the address of the data is not 16-byte aligned, the
-///    instruction may read two adjacent aligned blocks of memory to retrieve
-///    the requested data.
+/// Loads data from an unaligned memory location to elements in a 128-bit
+///    vector.
+///
+///    If the address of the data is not 16-byte aligned, the instruction may
+///    read two adjacent aligned blocks of memory to retrieve the requested
+///    data.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VLDDQU instruction.
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a 128-bit integer vector containing integer values.
@@ -48,12 +50,12 @@
   return (__m128i)__builtin_ia32_lddqu((char const *)__p);
 }
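
For illustration (helper name hypothetical, compiled with -msse3), the point of LDDQU is that the pointer need not be 16-byte aligned:

#include <pmmintrin.h>

__m128i load16(const void *p) {
  return _mm_lddqu_si128((const __m128i *)p);   // p may be unaligned
}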
 
-/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+/// Adds the even-indexed values and subtracts the odd-indexed values of
 ///    two 128-bit vectors of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VADDSUBPS instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the left source operand.
@@ -67,12 +69,12 @@
   return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Horizontally adds the adjacent pairs of values contained in two
+/// Horizontally adds the adjacent pairs of values contained in two
 ///    128-bit vectors of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VHADDPS instruction.
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the source operands.
@@ -90,12 +92,12 @@
   return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
 }
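
A sketch of a common _mm_hadd_ps idiom (helper name hypothetical): two horizontal adds collapse a vector to its total:

#include <pmmintrin.h>

float sum4(__m128 v) {
  __m128 t = _mm_hadd_ps(v, v);   // {v0+v1, v2+v3, v0+v1, v2+v3}
  t = _mm_hadd_ps(t, t);          // every lane now holds v0+v1+v2+v3
  return _mm_cvtss_f32(t);
}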
 
-/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+/// Horizontally subtracts the adjacent pairs of values contained in two
 ///    128-bit vectors of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VHSUBPS instruction.
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the source operands.
@@ -113,20 +115,20 @@
   return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
-///    vector of [4 x float] to float values stored in a 128-bit vector of
+/// Moves and duplicates odd-indexed values from a 128-bit vector
+///    of [4 x float] to float values stored in a 128-bit vector of
 ///    [4 x float].
-///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
-///    the destination.
-///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
-///    destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVSHDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
 ///
 /// \param __a
-///    A 128-bit vector of [4 x float].
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
 ///    values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -135,20 +137,19 @@
   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
 }
 
-/// \brief Duplicates low-order (even-indexed) values from a 128-bit
-///    vector of [4 x float] to float values stored in a 128-bit vector of
-///    [4 x float].
-///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
-///    the destination.
-///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
-///    destination.
+/// Duplicates even-indexed values from a 128-bit vector of
+///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVSLDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
 ///
 /// \param __a
-///    A 128-bit vector of [4 x float].
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
 ///    values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -157,12 +158,12 @@
   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
 }
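A small sketch showing both duplication patterns side by side (hypothetical values):

    #include <pmmintrin.h>

    void demo_dup(void) {
      __m128 v    = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
      __m128 odd  = _mm_movehdup_ps(v);  /* {1, 1, 3, 3} */
      __m128 even = _mm_moveldup_ps(v);  /* {0, 0, 2, 2} */
      (void)odd;
      (void)even;
    }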
 
-/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+/// Adds the even-indexed values and subtracts the odd-indexed values of
 ///    two 128-bit vectors of [2 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VADDSUBPD instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [2 x double] containing the left source operand.
@@ -176,12 +177,12 @@
   return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
 }
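A quick sketch of the lane arithmetic, per the VADDSUBPD instruction definition
(the low element is a subtraction, the high element an addition; values are
hypothetical):

    #include <pmmintrin.h>

    __m128d demo_addsub(void) {
      __m128d a = _mm_setr_pd(10.0, 20.0);
      __m128d b = _mm_setr_pd(1.0, 2.0);
      return _mm_addsub_pd(a, b);  /* {10 - 1, 20 + 2} = {9, 22} */
    }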
 
-/// \brief Horizontally adds the pairs of values contained in two 128-bit
+/// Horizontally adds the pairs of values contained in two 128-bit
 ///    vectors of [2 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VHADDPD instruction.
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [2 x double] containing one of the source operands.
@@ -199,12 +200,12 @@
   return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
 }
 
-/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
+/// Horizontally subtracts the pairs of values contained in two 128-bit
 ///    vectors of [2 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VHSUBPD instruction.
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [2 x double] containing one of the source operands.
@@ -222,16 +223,16 @@
   return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
 }
 
-/// \brief Moves and duplicates one double-precision value to double-precision
+/// Moves and duplicates one double-precision value to double-precision
 ///    values stored in a 128-bit vector of [2 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// __m128d _mm_loaddup_pd(double const * dp);
+/// __m128d _mm_loaddup_pd(double const *dp);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VMOVDDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
 ///
 /// \param dp
 ///    A pointer to a double-precision value to be moved and duplicated.
@@ -239,13 +240,13 @@
 ///    duplicated values.
 #define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
 
-/// \brief Moves and duplicates the double-precision value in the lower bits of
+/// Moves and duplicates the double-precision value in the lower bits of
 ///    a 128-bit vector of [2 x double] to double-precision values stored in a
 ///    128-bit vector of [2 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVDDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
@@ -258,21 +259,13 @@
   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
 }
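A minimal sketch (hypothetical values):

    #include <pmmintrin.h>

    __m128d demo_movedup(void) {
      __m128d v = _mm_setr_pd(1.5, 2.5);
      /* Both lanes hold the low element: {1.5, 1.5}. */
      return _mm_movedup_pd(v);
    }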
 
-#define _MM_DENORMALS_ZERO_ON   (0x0040)
-#define _MM_DENORMALS_ZERO_OFF  (0x0000)
-
-#define _MM_DENORMALS_ZERO_MASK (0x0040)
-
-#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
-#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
-
-/// \brief Establishes a linear address memory range to be monitored and puts
+/// Establishes a linear address memory range to be monitored and puts
 ///    the processor in the monitor event pending state. Data stored in the
 ///    monitored address range causes the processor to exit the pending state.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c MONITOR instruction.
+/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
 ///
 /// \param __p
 ///    The memory range to be monitored. The size of the range is determined by
@@ -287,13 +280,13 @@
   __builtin_ia32_monitor((void *)__p, __extensions, __hints);
 }
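The two intrinsics are typically used as a pair; a hedged sketch, assuming the
CPU exposes user-level MONITOR/MWAIT:

    #include <pmmintrin.h>

    volatile int flag;  /* hypothetical: set by another thread */

    void wait_for_flag(void) {
      while (!flag) {
        _mm_monitor((const void *)&flag, 0, 0);  /* arm the monitor */
        if (!flag)                               /* re-check, then idle */
          _mm_mwait(0, 0);
      }
    }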
 
-/// \brief Used with the MONITOR instruction to wait while the processor is in
+/// Used with the MONITOR instruction to wait while the processor is in
 ///    the monitor event pending state. Data stored in the monitored address
 ///    range causes the processor to exit the pending state.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c MWAIT instruction.
+/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
 ///
 /// \param __extensions
 ///    Optional extensions for the monitoring state, which may vary by
diff --git a/darwin-x86/clang-headers/popcntintrin.h b/darwin-x86/clang-headers/popcntintrin.h
index 7e2f167..75ceab9 100644
--- a/darwin-x86/clang-headers/popcntintrin.h
+++ b/darwin-x86/clang-headers/popcntintrin.h
@@ -21,17 +21,17 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef _POPCNTINTRIN_H
-#define _POPCNTINTRIN_H
+#ifndef __POPCNTINTRIN_H
+#define __POPCNTINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
 
-/// \brief Counts the number of bits in the source operand having a value of 1.
+/// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
 ///
 /// \param __A
 ///    An unsigned 32-bit integer operand.
@@ -43,11 +43,11 @@
   return __builtin_popcount(__A);
 }
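A minimal sketch (hypothetical value):

    #include <x86intrin.h>

    int demo_popcnt(void) {
      return _mm_popcnt_u32(0xF0F0u);  /* eight bits are set, returns 8 */
    }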
 
-/// \brief Counts the number of bits in the source operand having a value of 1.
+/// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
 ///
 /// \param __A
 ///    A signed 32-bit integer operand.
@@ -60,11 +60,11 @@
 }
 
 #ifdef __x86_64__
-/// \brief Counts the number of bits in the source operand having a value of 1.
+/// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
 ///
 /// \param __A
 ///    An unsigned 64-bit integer operand.
@@ -76,11 +76,11 @@
   return __builtin_popcountll(__A);
 }
 
-/// \brief Counts the number of bits in the source operand having a value of 1.
+/// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
 ///
 /// \param __A
 ///    A signed 64-bit integer operand.
@@ -95,4 +95,4 @@
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* _POPCNTINTRIN_H */
+#endif /* __POPCNTINTRIN_H */
diff --git a/darwin-x86/clang-headers/prfchwintrin.h b/darwin-x86/clang-headers/prfchwintrin.h
index ba02857..7085139 100644
--- a/darwin-x86/clang-headers/prfchwintrin.h
+++ b/darwin-x86/clang-headers/prfchwintrin.h
@@ -28,18 +28,42 @@
 #ifndef __PRFCHWINTRIN_H
 #define __PRFCHWINTRIN_H
 
-#if defined(__PRFCHW__) || defined(__3dNOW__)
+/// Loads a memory sequence containing the specified memory address into
+///    all data cache levels. The cache-coherency state is set to exclusive.
+///    Data can be read from and written to the cache line without additional
+///    delay.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
+///
+/// \param __P
+///    A pointer specifying the memory address to be prefetched.
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _m_prefetch(void *__P)
 {
   __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
 }
 
+/// Loads a memory sequence containing the specified memory address into
+///    the L1 data cache and sets the cache-coherency to modified. This
+///    provides a hint to the processor that the cache line will be modified.
+///    It is intended for use when the cache line will be written to shortly
+///    after the prefetch is performed.
+///
+///    Note that the effect of this intrinsic is dependent on the processor
+///    implementation.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHW instruction.
+///
+/// \param __P
+///    A pointer specifying the memory address to be prefetched.
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _m_prefetchw(void *__P)
 {
   __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
 }
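A hedged sketch of how the two prefetches might be used while streaming through
arrays (names are hypothetical):

    #include <x86intrin.h>

    void accumulate(int *dst, int *src, int n) {
      for (int i = 0; i < n; ++i) {
        if (i + 16 < n) {
          _m_prefetch(&src[i + 16]);   /* expected to be read soon */
          _m_prefetchw(&dst[i + 16]);  /* expected to be written soon */
        }
        dst[i] += src[i];
      }
    }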
-#endif
 
 #endif /* __PRFCHWINTRIN_H */
diff --git a/darwin-x86/clang-headers/ptwriteintrin.h b/darwin-x86/clang-headers/ptwriteintrin.h
new file mode 100644
index 0000000..1bb1df0
--- /dev/null
+++ b/darwin-x86/clang-headers/ptwriteintrin.h
@@ -0,0 +1,51 @@
+/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <ptwriteintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __PTWRITEINTRIN_H
+#define __PTWRITEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("ptwrite")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_ptwrite32(unsigned int __value) {
+  __builtin_ia32_ptwrite32(__value);
+}
+
+#ifdef __x86_64__
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_ptwrite64(unsigned long long __value) {
+  __builtin_ia32_ptwrite64(__value);
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PTWRITEINTRIN_H */
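A hedged usage sketch, assuming PTWRITE support and an active Intel PT trace
configuration:

    #include <x86intrin.h>

    /* Emits a 32-bit marker value into the Intel PT trace stream. */
    void mark_event(unsigned int event_id) {
      _ptwrite32(event_id);
    }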
diff --git a/darwin-x86/clang-headers/rdseedintrin.h b/darwin-x86/clang-headers/rdseedintrin.h
index 421f4ea..4194669 100644
--- a/darwin-x86/clang-headers/rdseedintrin.h
+++ b/darwin-x86/clang-headers/rdseedintrin.h
@@ -21,7 +21,7 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef __X86INTRIN_H
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
 #error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
 #endif
 
diff --git a/darwin-x86/clang-headers/sanitizer/allocator_interface.h b/darwin-x86/clang-headers/sanitizer/allocator_interface.h
new file mode 100644
index 0000000..e44c4a1
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/allocator_interface.h
@@ -0,0 +1,89 @@
+//===-- allocator_interface.h ---------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Public interface header for allocator used in sanitizers (ASan/TSan/MSan).
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_ALLOCATOR_INTERFACE_H
+#define SANITIZER_ALLOCATOR_INTERFACE_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  /* Returns the estimated number of bytes that will be reserved by the
+     allocator for a request of "size" bytes. If the allocator can't allocate
+     that much memory, returns the maximal possible allocation size; otherwise
+     returns "size". */
+  size_t __sanitizer_get_estimated_allocated_size(size_t size);
+
+  /* Returns true if p was returned by the allocator and
+     is not yet freed. */
+  int __sanitizer_get_ownership(const volatile void *p);
+
+  /* Returns the number of bytes reserved for the pointer p.
+     Requires (get_ownership(p) == true) or (p == 0). */
+  size_t __sanitizer_get_allocated_size(const volatile void *p);
+
+  /* Number of bytes allocated and not yet freed by the application. */
+  size_t __sanitizer_get_current_allocated_bytes(void);
+
+  /* Number of bytes mmap-ed by the allocator to fulfill allocation requests.
+     Generally, for a request of X bytes, the allocator can reserve and add to
+     its free lists a large number of chunks of size X for future requests.
+     All these chunks count toward the heap size. Currently, the allocator
+     never releases memory to the OS (instead, it just puts freed chunks on
+     free lists). */
+  size_t __sanitizer_get_heap_size(void);
+
+  /* Number of bytes mmap-ed by the allocator that can be used to fulfill
+     allocation requests. When a user program frees a memory chunk, it can
+     first fall into quarantine and will count toward
+     __sanitizer_get_free_bytes() later. */
+  size_t __sanitizer_get_free_bytes(void);
+
+  /* Number of bytes in unmapped pages that are released to the OS. Currently,
+     always returns 0. */
+  size_t __sanitizer_get_unmapped_bytes(void);
+
+  /* Malloc hooks that may be optionally provided by user.
+     __sanitizer_malloc_hook(ptr, size) is called immediately after
+       allocation of "size" bytes, which returned "ptr".
+     __sanitizer_free_hook(ptr) is called immediately before
+       deallocation of "ptr". */
+  void __sanitizer_malloc_hook(const volatile void *ptr, size_t size);
+  void __sanitizer_free_hook(const volatile void *ptr);
+
+  /* Installs a pair of hooks for malloc/free.
+     Several (currently, 5) hook pairs may be installed, they are executed
+     in the order they were installed and after calling
+     __sanitizer_malloc_hook/__sanitizer_free_hook.
+     Unlike __sanitizer_malloc_hook/__sanitizer_free_hook these hooks can be
+     chained and do not rely on weak symbols working on the platform, but
+     require __sanitizer_install_malloc_and_free_hooks to be called at startup
+     and thus will not be called on malloc/free very early in the process.
+     Returns the number of hooks currently installed or 0 on failure.
+     Not thread-safe, should be called in the main thread before starting
+     other threads.
+  */
+  int __sanitizer_install_malloc_and_free_hooks(
+      void (*malloc_hook)(const volatile void *, size_t),
+      void (*free_hook)(const volatile void *));
+
+  /* Drains allocator quarantines (calling thread's and global ones), returns
+     freed memory back to OS and releases other non-essential internal allocator
+     resources in attempt to reduce process RSS.
+     Currently available with ASan only.
+  */
+  void __sanitizer_purge_allocator(void);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
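A hedged usage sketch; these calls are only meaningful when the program is
built with a sanitizer runtime (e.g. -fsanitize=address):

    #include <sanitizer/allocator_interface.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
      void *p = malloc(100);
      if (__sanitizer_get_ownership(p))
        printf("allocator reserved %zu bytes\n",
               __sanitizer_get_allocated_size(p));
      printf("current heap size: %zu\n", __sanitizer_get_heap_size());
      free(p);
      return 0;
    }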
diff --git a/darwin-x86/clang-headers/sanitizer/asan_interface.h b/darwin-x86/clang-headers/sanitizer/asan_interface.h
new file mode 100644
index 0000000..f2d7714
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/asan_interface.h
@@ -0,0 +1,155 @@
+//===-- sanitizer/asan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_ASAN_INTERFACE_H
+#define SANITIZER_ASAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Marks memory region [addr, addr+size) as unaddressable.
+  // This memory must be previously allocated by the user program. Accessing
+  // addresses in this region from instrumented code is forbidden until
+  // this region is unpoisoned. This function is not guaranteed to poison
+  // the whole region - it may poison only a subregion of [addr, addr+size) due
+  // to ASan alignment restrictions.
+  // Method is NOT thread-safe in the sense that no two threads can
+  // (un)poison memory in the same memory region simultaneously.
+  void __asan_poison_memory_region(void const volatile *addr, size_t size);
+  // Marks memory region [addr, addr+size) as addressable.
+  // This memory must be previously allocated by the user program. Accessing
+  // addresses in this region is allowed until this region is poisoned again.
+  // This function may unpoison a superregion of [addr, addr+size) due to
+  // ASan alignment restrictions.
+  // Method is NOT thread-safe in the sense that no two threads can
+  // (un)poison memory in the same memory region simultaneously.
+  void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
+
+// User code should use macros instead of functions.
+#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  __asan_poison_memory_region((addr), (size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  __asan_unpoison_memory_region((addr), (size))
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  ((void)(addr), (void)(size))
+#endif
+
+  // Returns 1 if addr is poisoned (i.e. 1-byte read/write access to this
+  // address will result in an error report from AddressSanitizer).
+  // Otherwise returns 0.
+  int __asan_address_is_poisoned(void const volatile *addr);
+
+  // If at least one byte in [beg, beg+size) is poisoned, return the address
+  // of the first such byte. Otherwise return 0.
+  void *__asan_region_is_poisoned(void *beg, size_t size);
+
+  // Print the description of addr (useful when debugging in gdb).
+  void __asan_describe_address(void *addr);
+
+  // Useful for calling from a debugger to get information about an ASan error.
+  // Returns 1 if an error has been (or is being) reported, otherwise returns 0.
+  int __asan_report_present(void);
+
+  // Useful for calling from a debugger to get information about an ASan error.
+  // If an error has been (or is being) reported, the following functions return
+  // the pc, bp, sp, address, access type (0 = read, 1 = write), access size and
+  // bug description (e.g. "heap-use-after-free"). Otherwise they return 0.
+  void *__asan_get_report_pc(void);
+  void *__asan_get_report_bp(void);
+  void *__asan_get_report_sp(void);
+  void *__asan_get_report_address(void);
+  int __asan_get_report_access_type(void);
+  size_t __asan_get_report_access_size(void);
+  const char *__asan_get_report_description(void);
+
+  // Useful for calling from the debugger to get information about a pointer.
+  // Returns the category of the given pointer as a constant string.
+  // Possible return values are "global", "stack", "stack-fake", "heap",
+  // "heap-invalid", "shadow-low", "shadow-gap", "shadow-high", "unknown".
+  // If global or stack, tries to also return the variable name, address and
+  // size. If heap, tries to return the chunk address and size. 'name' should
+  // point to an allocated buffer of size 'name_size'.
+  const char *__asan_locate_address(void *addr, char *name, size_t name_size,
+                                    void **region_address, size_t *region_size);
+
+  // Useful for calling from the debugger to get the allocation stack trace
+  // and thread ID for a heap address. Stores up to 'size' frames into 'trace',
+  // returns the number of stored frames or 0 on error.
+  size_t __asan_get_alloc_stack(void *addr, void **trace, size_t size,
+                                int *thread_id);
+
+  // Useful for calling from the debugger to get the free stack trace
+  // and thread ID for a heap address. Stores up to 'size' frames into 'trace',
+  // returns the number of stored frames or 0 on error.
+  size_t __asan_get_free_stack(void *addr, void **trace, size_t size,
+                               int *thread_id);
+
+  // Useful for calling from the debugger to get the current shadow memory
+  // mapping.
+  void __asan_get_shadow_mapping(size_t *shadow_scale, size_t *shadow_offset);
+
+  // This is an internal function that is called to report an error.
+  // However it is still a part of the interface because users may want to
+  // set a breakpoint on this function in a debugger.
+  void __asan_report_error(void *pc, void *bp, void *sp,
+                           void *addr, int is_write, size_t access_size);
+
+  // Deprecated. Call __sanitizer_set_death_callback instead.
+  void __asan_set_death_callback(void (*callback)(void));
+
+  void __asan_set_error_report_callback(void (*callback)(const char*));
+
+  // The user may provide a function that will be called right when ASan
+  // detects an error. This can be used to notice cases when ASan detects an
+  // error but the program crashes before the ASan report is printed.
+  void __asan_on_error(void);
+
+  // Prints accumulated stats to stderr. Used for debugging.
+  void __asan_print_accumulated_stats(void);
+
+  // This function may optionally be provided by the user and should return
+  // a string containing ASan runtime options. See asan_flags.h for details.
+  const char* __asan_default_options(void);
+
+  // The following 2 functions facilitate garbage collection in presence of
+  // asan's fake stack.
+
+  // Returns an opaque handler to be used later in __asan_addr_is_in_fake_stack.
+  // Returns NULL if the current thread does not have a fake stack.
+  void *__asan_get_current_fake_stack(void);
+
+  // If fake_stack is non-NULL and addr belongs to a fake frame in
+  // fake_stack, returns the address on real stack that corresponds to
+  // the fake frame and sets beg/end to the boundaries of this fake frame.
+  // Otherwise returns NULL and does not touch beg/end.
+  // If beg/end are NULL, they are not touched.
+  // This function may be called from a thread other than the owner of
+  // fake_stack, but the owner thread needs to be alive.
+  void *__asan_addr_is_in_fake_stack(void *fake_stack, void *addr, void **beg,
+                                     void **end);
+
+  // Performs cleanup before a [[noreturn]] function.  Must be called
+  // before things like _exit and execl to avoid false positives on stack.
+  void __asan_handle_no_return(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_ASAN_INTERFACE_H
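A hedged sketch of manual poisoning with the macros above (build with
-fsanitize=address; without ASan the macros compile to no-ops):

    #include <sanitizer/asan_interface.h>
    #include <stdlib.h>

    int main(void) {
      char *buf = malloc(64);
      ASAN_POISON_MEMORY_REGION(buf + 32, 32);    /* tail is now off-limits */
      buf[0] = 1;                                 /* fine */
      /* buf[40] = 1; */                          /* would be reported */
      ASAN_UNPOISON_MEMORY_REGION(buf + 32, 32);  /* unpoison before free */
      free(buf);
      return 0;
    }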
diff --git a/darwin-x86/clang-headers/sanitizer/common_interface_defs.h b/darwin-x86/clang-headers/sanitizer/common_interface_defs.h
new file mode 100644
index 0000000..d11cb1a
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/common_interface_defs.h
@@ -0,0 +1,203 @@
+//===-- sanitizer/common_interface_defs.h -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common part of the public sanitizer interface.
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_COMMON_INTERFACE_DEFS_H
+#define SANITIZER_COMMON_INTERFACE_DEFS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+// GCC does not understand __has_feature.
+#if !defined(__has_feature)
+# define __has_feature(x) 0
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Arguments for __sanitizer_sandbox_on_notify() below.
+  typedef struct {
+    // Enable sandbox support in sanitizer coverage.
+    int coverage_sandboxed;
+    // File descriptor to write coverage data to. If -1 is passed, a file will
+    // be pre-opened by __sanitizer_sandbox_on_notify(). This field has no
+    // effect if coverage_sandboxed == 0.
+    intptr_t coverage_fd;
+    // If non-zero, split the coverage data into well-formed blocks. This is
+    // useful when coverage_fd is a socket descriptor. Each block will contain
+    // a header, allowing data from multiple processes to be sent over the same
+    // socket.
+    unsigned int coverage_max_block_size;
+  } __sanitizer_sandbox_arguments;
+
+  // Tell the tools to write their reports to "path.<pid>" instead of stderr.
+  void __sanitizer_set_report_path(const char *path);
+  // Tell the tools to write their reports to the provided file descriptor
+  // (cast to void *).
+  void __sanitizer_set_report_fd(void *fd);
+
+  // Notify the tools that the sandbox is going to be turned on. The reserved
+  // parameter will be used in the future to hold a structure with functions
+  // that the tools may call to bypass the sandbox.
+  void __sanitizer_sandbox_on_notify(__sanitizer_sandbox_arguments *args);
+
+  // This function is called by the tool when it has just finished reporting
+  // an error. 'error_summary' is a one-line string that summarizes
+  // the error message. This function can be overridden by the client.
+  void __sanitizer_report_error_summary(const char *error_summary);
+
+  // Some of the sanitizers (e.g. asan/tsan) may miss bugs that happen
+  // in unaligned loads/stores. In order to find such bugs reliably one needs
+  // to replace plain unaligned loads/stores with these calls.
+  uint16_t __sanitizer_unaligned_load16(const void *p);
+  uint32_t __sanitizer_unaligned_load32(const void *p);
+  uint64_t __sanitizer_unaligned_load64(const void *p);
+  void __sanitizer_unaligned_store16(void *p, uint16_t x);
+  void __sanitizer_unaligned_store32(void *p, uint32_t x);
+  void __sanitizer_unaligned_store64(void *p, uint64_t x);
+
+  // Returns 1 on the first call, then returns 0 thereafter.  Called by the tool
+  // to ensure only one report is printed when multiple errors occur
+  // simultaneously.
+  int __sanitizer_acquire_crash_state();
+
+  // Annotate the current state of a contiguous container, such as
+  // std::vector, std::string or similar.
+  // A contiguous container is a container that keeps all of its elements
+  // in a contiguous region of memory. The container owns the region of memory
+  // [beg, end); the memory [beg, mid) is used to store the current elements
+  // and the memory [mid, end) is reserved for future elements;
+  // beg <= mid <= end. For example, in "std::vector<> v"
+  //   beg = &v[0];
+  //   end = beg + v.capacity() * sizeof(v[0]);
+  //   mid = beg + v.size()     * sizeof(v[0]);
+  //
+  // This annotation tells the Sanitizer tool about the current state of the
+  // container so that the tool can report errors when memory from [mid, end)
+  // is accessed. Insert this annotation into methods like push_back/pop_back.
+  // Supply the old and the new values of mid (old_mid/new_mid).
+  // In the initial state mid == end and so should be the final
+  // state when the container is destroyed or when it reallocates the storage.
+  //
+  // Use with caution and don't use for anything other than vector-like classes.
+  //
+  // For AddressSanitizer, 'beg' should be 8-aligned and 'end' should
+  // be either 8-aligned or it should point to the end of a separate heap-,
+  // stack-, or global- allocated buffer. I.e. the following will not work:
+  //   int64_t x[2];  // 16 bytes, 8-aligned.
+  //   char *beg = (char *)&x[0];
+  //   char *end = beg + 12;  // Not 8 aligned, not the end of the buffer.
+  // This however will work fine:
+  //   int32_t x[3];  // 12 bytes, but 8-aligned under AddressSanitizer.
+  //   char *beg = (char*)&x[0];
+  //   char *end = beg + 12;  // Not 8-aligned, but is the end of the buffer.
+  void __sanitizer_annotate_contiguous_container(const void *beg,
+                                                 const void *end,
+                                                 const void *old_mid,
+                                                 const void *new_mid);
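A hedged sketch of the bookkeeping described above for a shrinking container
(names are hypothetical):

    #include <sanitizer/common_interface_defs.h>

    /* Storage is [beg, end); the live size shrinks from old_size to new_size,
       so [beg + new_size, end) must become unaddressable again. */
    static void on_shrink(char *beg, char *end,
                          size_t old_size, size_t new_size) {
      __sanitizer_annotate_contiguous_container(beg, end,
                                                beg + old_size,
                                                beg + new_size);
    }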
+  // Returns true if the contiguous container [beg, end) is properly poisoned
+  // (e.g. with __sanitizer_annotate_contiguous_container), i.e. if
+  //  - [beg, mid) is addressable,
+  //  - [mid, end) is unaddressable.
+  // Full verification requires O(end-beg) time; this function tries to avoid
+  // such complexity by touching only parts of the container around beg/mid/end.
+  int __sanitizer_verify_contiguous_container(const void *beg, const void *mid,
+                                              const void *end);
+
+  // Similar to __sanitizer_verify_contiguous_container, but returns the
+  // address of the first improperly poisoned byte. Returns null if the area
+  // is poisoned properly.
+  const void *__sanitizer_contiguous_container_find_bad_address(
+      const void *beg, const void *mid, const void *end);
+
+  // Print the stack trace leading to this call. Useful for debugging user code.
+  void __sanitizer_print_stack_trace(void);
+
+  // Symbolizes the supplied 'pc' using the format string 'fmt'.
+  // Outputs at most 'out_buf_size' bytes into 'out_buf'.
+  // The format syntax is described in
+  // lib/sanitizer_common/sanitizer_stacktrace_printer.h.
+  void __sanitizer_symbolize_pc(void *pc, const char *fmt, char *out_buf,
+                                size_t out_buf_size);
+  // Same as __sanitizer_symbolize_pc, but for data section (i.e. globals).
+  void __sanitizer_symbolize_global(void *data_ptr, const char *fmt,
+                                    char *out_buf, size_t out_buf_size);
+
+  // Sets the callback to be called right before death on error.
+  // Passing 0 will unset the callback.
+  void __sanitizer_set_death_callback(void (*callback)(void));
+
+  // Interceptor hooks.
+  // Whenever a libc function interceptor is called, it checks whether the
+  // corresponding weak hook is defined, and if so -- calls it.
+  // The primary use case is data-flow-guided fuzzing, where the fuzzer needs
+  // to know what is being passed to libc functions, e.g. memcmp.
+  // FIXME: implement more hooks.
+  void __sanitizer_weak_hook_memcmp(void *called_pc, const void *s1,
+                                    const void *s2, size_t n, int result);
+  void __sanitizer_weak_hook_strncmp(void *called_pc, const char *s1,
+                                    const char *s2, size_t n, int result);
+  void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1,
+                                         const char *s2, size_t n, int result);
+  void __sanitizer_weak_hook_strcmp(void *called_pc, const char *s1,
+                                    const char *s2, int result);
+  void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1,
+                                        const char *s2, int result);
+  void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1,
+                                    const char *s2, char *result);
+  void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1,
+                                        const char *s2, char *result);
+  void __sanitizer_weak_hook_memmem(void *called_pc,
+                                    const void *s1, size_t len1,
+                                    const void *s2, size_t len2, void *result);
+
+  // Prints stack traces for all live heap allocations ordered by total
+  // allocation size until `top_percent` of total live heap is shown.
+  // `top_percent` should be between 1 and 100.
+  // At most `max_number_of_contexts` contexts (stack traces) are printed.
+  // Experimental feature currently available only with asan on Linux/x86_64.
+  void __sanitizer_print_memory_profile(size_t top_percent,
+                                        size_t max_number_of_contexts);
+
+  // Fiber annotation interface.
+  // Before switching to a different stack, one must call
+  // __sanitizer_start_switch_fiber with a pointer to the bottom of the
+  // destination stack and its size. When code starts running on the new stack,
+  // it must call __sanitizer_finish_switch_fiber to finalize the switch.
+  // The start_switch function takes a void** to store the current fake stack if
+  // there is one (it is needed when detect_stack_use_after_return is enabled).
+  // When restoring a stack, this pointer must be given to the finish_switch
+  // function. In most cases, this void* can be stored on the stack just before
+  // switching.  When leaving a fiber permanently, null must be passed as the
+  // first argument to the start_switch function so that the fake stack is
+  // destroyed.
+  // If you do not want support for stack use-after-return detection, you can
+  // always pass null to these two functions.
+  // Note that the fake stack mechanism is disabled during fiber switch, so if a
+  // signal callback runs during the switch, it will not benefit from the stack
+  // use-after-return detection.
+  void __sanitizer_start_switch_fiber(void **fake_stack_save,
+                                      const void *bottom, size_t size);
+  void __sanitizer_finish_switch_fiber(void *fake_stack_save,
+                                       const void **bottom_old,
+                                       size_t *size_old);
+
+  // Get full module name and calculate pc offset within it.
+  // Returns 1 if pc belongs to some module, 0 if the module was not found.
+  int __sanitizer_get_module_and_offset_for_pc(void *pc, char *module_path,
+                                               size_t module_path_len,
+                                               void **pc_offset);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_COMMON_INTERFACE_DEFS_H
diff --git a/darwin-x86/clang-headers/sanitizer/coverage_interface.h b/darwin-x86/clang-headers/sanitizer/coverage_interface.h
new file mode 100644
index 0000000..bc95a5c
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/coverage_interface.h
@@ -0,0 +1,36 @@
+//===-- sanitizer/coverage_interface.h --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Public interface for sanitizer coverage.
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_COVERAGE_INTERFACE_H
+#define SANITIZER_COVERAGE_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  // Record and dump coverage info.
+  void __sanitizer_cov_dump(void);
+
+  // Clear collected coverage info.
+  void __sanitizer_cov_reset(void);
+
+  // Dump collected coverage info. Sorts pcs by module into individual .sancov
+  // files.
+  void __sanitizer_dump_coverage(const uintptr_t *pcs, uintptr_t len);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_COVERAGE_INTERFACE_H
diff --git a/darwin-x86/clang-headers/sanitizer/dfsan_interface.h b/darwin-x86/clang-headers/sanitizer/dfsan_interface.h
new file mode 100644
index 0000000..05666f7
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/dfsan_interface.h
@@ -0,0 +1,116 @@
+//===-- dfsan_interface.h -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of DataFlowSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef DFSAN_INTERFACE_H
+#define DFSAN_INTERFACE_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint16_t dfsan_label;
+
+/// Stores information associated with a specific label identifier.  A label
+/// may be a base label created using dfsan_create_label, with associated
+/// text description and user data, or an automatically created union label,
+/// which represents the union of two label identifiers (which may themselves
+/// be base or union labels).
+struct dfsan_label_info {
+  // Fields for union labels, set to 0 for base labels.
+  dfsan_label l1;
+  dfsan_label l2;
+
+  // Fields for base labels.
+  const char *desc;
+  void *userdata;
+};
+
+/// Signature of the callback argument to dfsan_set_write_callback().
+typedef void (*dfsan_write_callback_t)(int fd, const void *buf, size_t count);
+
+/// Computes the union of \c l1 and \c l2, possibly creating a union label in
+/// the process.
+dfsan_label dfsan_union(dfsan_label l1, dfsan_label l2);
+
+/// Creates and returns a base label with the given description and user data.
+dfsan_label dfsan_create_label(const char *desc, void *userdata);
+
+/// Sets the label for each address in [addr,addr+size) to \c label.
+void dfsan_set_label(dfsan_label label, void *addr, size_t size);
+
+/// Sets the label for each address in [addr,addr+size) to the union of the
+/// current label for that address and \c label.
+void dfsan_add_label(dfsan_label label, void *addr, size_t size);
+
+/// Retrieves the label associated with the given data.
+///
+/// The type of 'data' is arbitrary.  The function accepts a value of any type,
+/// which can be truncated or extended (implicitly or explicitly) as necessary.
+/// The truncation/extension operations will preserve the label of the original
+/// value.
+dfsan_label dfsan_get_label(long data);
+
+/// Retrieves the label associated with the data at the given address.
+dfsan_label dfsan_read_label(const void *addr, size_t size);
+
+/// Retrieves a pointer to the dfsan_label_info struct for the given label.
+const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label);
+
+/// Returns whether the given label \c label contains the label \c elem.
+int dfsan_has_label(dfsan_label label, dfsan_label elem);
+
+/// If the given label \c label contains a label with the description \c desc,
+/// returns that label; otherwise returns 0.
+dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc);
+
+/// Returns the number of labels allocated.
+size_t dfsan_get_label_count(void);
+
+/// Sets a callback to be invoked on calls to write().  The callback is invoked
+/// before the write is done.  The write is not guaranteed to succeed when the
+/// callback executes.  Pass in NULL to remove any callback.
+void dfsan_set_write_callback(dfsan_write_callback_t labeled_write_callback);
+
+/// Writes the labels currently used by the program to the given file
+/// descriptor. The lines of the output have the following format:
+///
+/// <label> <parent label 1> <parent label 2> <label description if any>
+void dfsan_dump_labels(int fd);
+
+/// Interceptor hooks.
+/// Whenever one of dfsan's custom functions is called, the corresponding
+/// hook is called if it is non-null. The hooks should be defined by the user.
+/// The primary use case is taint-guided fuzzing, where the fuzzer
+/// needs to see the parameters of the function and the labels.
+/// FIXME: implement more hooks.
+void dfsan_weak_hook_memcmp(void *caller_pc, const void *s1, const void *s2,
+                            size_t n, dfsan_label s1_label,
+                            dfsan_label s2_label, dfsan_label n_label);
+void dfsan_weak_hook_strncmp(void *caller_pc, const char *s1, const char *s2,
+                             size_t n, dfsan_label s1_label,
+                             dfsan_label s2_label, dfsan_label n_label);
+#ifdef __cplusplus
+}  // extern "C"
+
+template <typename T>
+void dfsan_set_label(dfsan_label label, T &data) {  // NOLINT
+  dfsan_set_label(label, (void *)&data, sizeof(T));
+}
+
+#endif
+
+#endif  // DFSAN_INTERFACE_H
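A hedged end-to-end sketch (build with -fsanitize=dataflow):

    #include <sanitizer/dfsan_interface.h>
    #include <assert.h>
    #include <stddef.h>

    int main(void) {
      int x = 7;
      dfsan_label lx = dfsan_create_label("x", NULL);
      dfsan_set_label(lx, &x, sizeof(x));   /* taint the input */
      int y = x * 2;                        /* label propagates to y */
      assert(dfsan_has_label(dfsan_get_label(y), lx));
      return 0;
    }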
diff --git a/darwin-x86/clang-headers/sanitizer/esan_interface.h b/darwin-x86/clang-headers/sanitizer/esan_interface.h
new file mode 100644
index 0000000..c755ed3
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/esan_interface.h
@@ -0,0 +1,50 @@
+//===-- sanitizer/esan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_ESAN_INTERFACE_H
+#define SANITIZER_ESAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+// We declare our interface routines as weak to allow the user to avoid
+// ifdefs and instead use this pattern to allow building the same sources
+// with and without our runtime library:
+//     if (__esan_report)
+//       __esan_report();
+#ifdef _MSC_VER
+/* selectany is as close to weak as we'll get. */
+#define COMPILER_RT_WEAK __declspec(selectany)
+#elif __GNUC__
+#define COMPILER_RT_WEAK __attribute__((weak))
+#else
+#define COMPILER_RT_WEAK
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This function can be called mid-run (or at the end of a run for
+// a server process that doesn't shut down normally) to request that
+// data for that point in the run be reported from the tool.
+void COMPILER_RT_WEAK __esan_report(void);
+
+// This function returns the number of samples that the esan tool has collected
+// to this point.  This is useful for testing.
+unsigned int COMPILER_RT_WEAK __esan_get_sample_count(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // SANITIZER_ESAN_INTERFACE_H
diff --git a/darwin-x86/clang-headers/sanitizer/hwasan_interface.h b/darwin-x86/clang-headers/sanitizer/hwasan_interface.h
new file mode 100644
index 0000000..79bf23e
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/hwasan_interface.h
@@ -0,0 +1,81 @@
+//===-- sanitizer/asan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of HWAddressSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_HWASAN_INTERFACE_H
+#define SANITIZER_HWASAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Initialize shadow but not the rest of the runtime.
+  // Does not call libc unless there is an error.
+  // Can be called multiple times, or not at all (in which case the shadow
+  // will be initialized by the compiler-inserted __hwasan_init() call).
+  void __hwasan_shadow_init(void);
+
+  // This function may optionally be provided by the user and should return
+  // a string containing HWASan runtime options. See asan_flags.h for details.
+  const char* __hwasan_default_options(void);
+
+  void __hwasan_enable_allocator_tagging(void);
+  void __hwasan_disable_allocator_tagging(void);
+
+  // Mark region of memory with the given tag. Both address and size need to be
+  // 16-byte aligned.
+  void __hwasan_tag_memory(const volatile void *p, unsigned char tag,
+                           size_t size);
+
+  /// Set pointer tag. Previous tag is lost.
+  void *__hwasan_tag_pointer(const volatile void *p, unsigned char tag);
+
+  // Set memory tag from the current SP address to the given address to zero.
+  // This is meant to annotate longjmp and other non-local jumps.
+  // This function needs to know the (almost) exact destination frame address;
+  // clearing shadow for the entire thread stack like __asan_handle_no_return
+  // does would cause false reports.
+  void __hwasan_handle_longjmp(const void *sp_dst);
+
+  // Libc hook for thread creation. Should be called in the child thread before
+  // any instrumented code.
+  void __hwasan_thread_enter();
+
+  // Libc hook for thread destruction. No instrumented code should run after
+  // this call.
+  void __hwasan_thread_exit();
+
+  // Print shadow and origin for the memory range to stderr in a human-readable
+  // format.
+  void __hwasan_print_shadow(const volatile void *x, size_t size);
+
+  int __sanitizer_posix_memalign(void **memptr, size_t alignment, size_t size);
+  void * __sanitizer_memalign(size_t alignment, size_t size);
+  void * __sanitizer_aligned_alloc(size_t alignment, size_t size);
+  void * __sanitizer___libc_memalign(size_t alignment, size_t size);
+  void * __sanitizer_valloc(size_t size);
+  void * __sanitizer_pvalloc(size_t size);
+  void __sanitizer_free(void *ptr);
+  void __sanitizer_cfree(void *ptr);
+  size_t __sanitizer_malloc_usable_size(const void *ptr);
+  struct mallinfo __sanitizer_mallinfo();
+  int __sanitizer_mallopt(int cmd, int value);
+  void __sanitizer_malloc_stats(void);
+  void * __sanitizer_calloc(size_t nmemb, size_t size);
+  void * __sanitizer_realloc(void *ptr, size_t size);
+  void * __sanitizer_malloc(size_t size);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_HWASAN_INTERFACE_H
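A hedged sketch combining the two tagging calls (AArch64 with
-fsanitize=hwaddress assumed; p and size must be 16-byte aligned as noted
above):

    #include <sanitizer/hwasan_interface.h>

    static void *retag(void *p, size_t size, unsigned char tag) {
      __hwasan_tag_memory(p, tag, size);    /* retag the shadow for [p, p+size) */
      return __hwasan_tag_pointer(p, tag);  /* matching tagged pointer */
    }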
diff --git a/darwin-x86/clang-headers/sanitizer/linux_syscall_hooks.h b/darwin-x86/clang-headers/sanitizer/linux_syscall_hooks.h
new file mode 100644
index 0000000..09f261d
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/linux_syscall_hooks.h
@@ -0,0 +1,3083 @@
+//===-- linux_syscall_hooks.h ---------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of public sanitizer interface.
+//
+// System call handlers.
+//
+// Interface methods declared in this header implement pre- and post- syscall
+// actions for the active sanitizer.
+// Usage:
+//   __sanitizer_syscall_pre_getfoo(...args...);
+//   long res = syscall(__NR_getfoo, ...args...);
+//   __sanitizer_syscall_post_getfoo(res, ...args...);
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_LINUX_SYSCALL_HOOKS_H
+#define SANITIZER_LINUX_SYSCALL_HOOKS_H
+
+#define __sanitizer_syscall_pre_time(tloc) \
+  __sanitizer_syscall_pre_impl_time((long)(tloc))
+#define __sanitizer_syscall_post_time(res, tloc) \
+  __sanitizer_syscall_post_impl_time(res, (long)(tloc))
+#define __sanitizer_syscall_pre_stime(tptr) \
+  __sanitizer_syscall_pre_impl_stime((long)(tptr))
+#define __sanitizer_syscall_post_stime(res, tptr) \
+  __sanitizer_syscall_post_impl_stime(res, (long)(tptr))
+#define __sanitizer_syscall_pre_gettimeofday(tv, tz) \
+  __sanitizer_syscall_pre_impl_gettimeofday((long)(tv), (long)(tz))
+#define __sanitizer_syscall_post_gettimeofday(res, tv, tz) \
+  __sanitizer_syscall_post_impl_gettimeofday(res, (long)(tv), (long)(tz))
+#define __sanitizer_syscall_pre_settimeofday(tv, tz) \
+  __sanitizer_syscall_pre_impl_settimeofday((long)(tv), (long)(tz))
+#define __sanitizer_syscall_post_settimeofday(res, tv, tz) \
+  __sanitizer_syscall_post_impl_settimeofday(res, (long)(tv), (long)(tz))
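A hedged sketch of the pre/post pattern from the header comment, applied to the
gettimeofday hooks just above (Linux, built with an active sanitizer):

    #include <sanitizer/linux_syscall_hooks.h>
    #include <sys/syscall.h>
    #include <sys/time.h>
    #include <unistd.h>

    long traced_gettimeofday(struct timeval *tv, struct timezone *tz) {
      __sanitizer_syscall_pre_gettimeofday(tv, tz);
      long res = syscall(__NR_gettimeofday, tv, tz);
      __sanitizer_syscall_post_gettimeofday(res, tv, tz);
      return res;
    }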
+#define __sanitizer_syscall_pre_adjtimex(txc_p) \
+  __sanitizer_syscall_pre_impl_adjtimex((long)(txc_p))
+#define __sanitizer_syscall_post_adjtimex(res, txc_p) \
+  __sanitizer_syscall_post_impl_adjtimex(res, (long)(txc_p))
+#define __sanitizer_syscall_pre_times(tbuf) \
+  __sanitizer_syscall_pre_impl_times((long)(tbuf))
+#define __sanitizer_syscall_post_times(res, tbuf) \
+  __sanitizer_syscall_post_impl_times(res, (long)(tbuf))
+#define __sanitizer_syscall_pre_gettid() __sanitizer_syscall_pre_impl_gettid()
+#define __sanitizer_syscall_post_gettid(res) \
+  __sanitizer_syscall_post_impl_gettid(res)
+#define __sanitizer_syscall_pre_nanosleep(rqtp, rmtp) \
+  __sanitizer_syscall_pre_impl_nanosleep((long)(rqtp), (long)(rmtp))
+#define __sanitizer_syscall_post_nanosleep(res, rqtp, rmtp) \
+  __sanitizer_syscall_post_impl_nanosleep(res, (long)(rqtp), (long)(rmtp))
+#define __sanitizer_syscall_pre_alarm(seconds) \
+  __sanitizer_syscall_pre_impl_alarm((long)(seconds))
+#define __sanitizer_syscall_post_alarm(res, seconds) \
+  __sanitizer_syscall_post_impl_alarm(res, (long)(seconds))
+#define __sanitizer_syscall_pre_getpid() __sanitizer_syscall_pre_impl_getpid()
+#define __sanitizer_syscall_post_getpid(res) \
+  __sanitizer_syscall_post_impl_getpid(res)
+#define __sanitizer_syscall_pre_getppid() __sanitizer_syscall_pre_impl_getppid()
+#define __sanitizer_syscall_post_getppid(res) \
+  __sanitizer_syscall_post_impl_getppid(res)
+#define __sanitizer_syscall_pre_getuid() __sanitizer_syscall_pre_impl_getuid()
+#define __sanitizer_syscall_post_getuid(res) \
+  __sanitizer_syscall_post_impl_getuid(res)
+#define __sanitizer_syscall_pre_geteuid() __sanitizer_syscall_pre_impl_geteuid()
+#define __sanitizer_syscall_post_geteuid(res) \
+  __sanitizer_syscall_post_impl_geteuid(res)
+#define __sanitizer_syscall_pre_getgid() __sanitizer_syscall_pre_impl_getgid()
+#define __sanitizer_syscall_post_getgid(res) \
+  __sanitizer_syscall_post_impl_getgid(res)
+#define __sanitizer_syscall_pre_getegid() __sanitizer_syscall_pre_impl_getegid()
+#define __sanitizer_syscall_post_getegid(res) \
+  __sanitizer_syscall_post_impl_getegid(res)
+#define __sanitizer_syscall_pre_getresuid(ruid, euid, suid)          \
+  __sanitizer_syscall_pre_impl_getresuid((long)(ruid), (long)(euid), \
+                                         (long)(suid))
+#define __sanitizer_syscall_post_getresuid(res, ruid, euid, suid)          \
+  __sanitizer_syscall_post_impl_getresuid(res, (long)(ruid), (long)(euid), \
+                                          (long)(suid))
+#define __sanitizer_syscall_pre_getresgid(rgid, egid, sgid)          \
+  __sanitizer_syscall_pre_impl_getresgid((long)(rgid), (long)(egid), \
+                                         (long)(sgid))
+#define __sanitizer_syscall_post_getresgid(res, rgid, egid, sgid)          \
+  __sanitizer_syscall_post_impl_getresgid(res, (long)(rgid), (long)(egid), \
+                                          (long)(sgid))
+#define __sanitizer_syscall_pre_getpgid(pid) \
+  __sanitizer_syscall_pre_impl_getpgid((long)(pid))
+#define __sanitizer_syscall_post_getpgid(res, pid) \
+  __sanitizer_syscall_post_impl_getpgid(res, (long)(pid))
+#define __sanitizer_syscall_pre_getpgrp() __sanitizer_syscall_pre_impl_getpgrp()
+#define __sanitizer_syscall_post_getpgrp(res) \
+  __sanitizer_syscall_post_impl_getpgrp(res)
+#define __sanitizer_syscall_pre_getsid(pid) \
+  __sanitizer_syscall_pre_impl_getsid((long)(pid))
+#define __sanitizer_syscall_post_getsid(res, pid) \
+  __sanitizer_syscall_post_impl_getsid(res, (long)(pid))
+#define __sanitizer_syscall_pre_getgroups(gidsetsize, grouplist) \
+  __sanitizer_syscall_pre_impl_getgroups((long)(gidsetsize), (long)(grouplist))
+#define __sanitizer_syscall_post_getgroups(res, gidsetsize, grouplist) \
+  __sanitizer_syscall_post_impl_getgroups(res, (long)(gidsetsize),     \
+                                          (long)(grouplist))
+#define __sanitizer_syscall_pre_setregid(rgid, egid) \
+  __sanitizer_syscall_pre_impl_setregid((long)(rgid), (long)(egid))
+#define __sanitizer_syscall_post_setregid(res, rgid, egid) \
+  __sanitizer_syscall_post_impl_setregid(res, (long)(rgid), (long)(egid))
+#define __sanitizer_syscall_pre_setgid(gid) \
+  __sanitizer_syscall_pre_impl_setgid((long)(gid))
+#define __sanitizer_syscall_post_setgid(res, gid) \
+  __sanitizer_syscall_post_impl_setgid(res, (long)(gid))
+#define __sanitizer_syscall_pre_setreuid(ruid, euid) \
+  __sanitizer_syscall_pre_impl_setreuid((long)(ruid), (long)(euid))
+#define __sanitizer_syscall_post_setreuid(res, ruid, euid) \
+  __sanitizer_syscall_post_impl_setreuid(res, (long)(ruid), (long)(euid))
+#define __sanitizer_syscall_pre_setuid(uid) \
+  __sanitizer_syscall_pre_impl_setuid((long)(uid))
+#define __sanitizer_syscall_post_setuid(res, uid) \
+  __sanitizer_syscall_post_impl_setuid(res, (long)(uid))
+#define __sanitizer_syscall_pre_setresuid(ruid, euid, suid)          \
+  __sanitizer_syscall_pre_impl_setresuid((long)(ruid), (long)(euid), \
+                                         (long)(suid))
+#define __sanitizer_syscall_post_setresuid(res, ruid, euid, suid)          \
+  __sanitizer_syscall_post_impl_setresuid(res, (long)(ruid), (long)(euid), \
+                                          (long)(suid))
+#define __sanitizer_syscall_pre_setresgid(rgid, egid, sgid)          \
+  __sanitizer_syscall_pre_impl_setresgid((long)(rgid), (long)(egid), \
+                                         (long)(sgid))
+#define __sanitizer_syscall_post_setresgid(res, rgid, egid, sgid)          \
+  __sanitizer_syscall_post_impl_setresgid(res, (long)(rgid), (long)(egid), \
+                                          (long)(sgid))
+#define __sanitizer_syscall_pre_setfsuid(uid) \
+  __sanitizer_syscall_pre_impl_setfsuid((long)(uid))
+#define __sanitizer_syscall_post_setfsuid(res, uid) \
+  __sanitizer_syscall_post_impl_setfsuid(res, (long)(uid))
+#define __sanitizer_syscall_pre_setfsgid(gid) \
+  __sanitizer_syscall_pre_impl_setfsgid((long)(gid))
+#define __sanitizer_syscall_post_setfsgid(res, gid) \
+  __sanitizer_syscall_post_impl_setfsgid(res, (long)(gid))
+#define __sanitizer_syscall_pre_setpgid(pid, pgid) \
+  __sanitizer_syscall_pre_impl_setpgid((long)(pid), (long)(pgid))
+#define __sanitizer_syscall_post_setpgid(res, pid, pgid) \
+  __sanitizer_syscall_post_impl_setpgid(res, (long)(pid), (long)(pgid))
+#define __sanitizer_syscall_pre_setsid() __sanitizer_syscall_pre_impl_setsid()
+#define __sanitizer_syscall_post_setsid(res) \
+  __sanitizer_syscall_post_impl_setsid(res)
+#define __sanitizer_syscall_pre_setgroups(gidsetsize, grouplist) \
+  __sanitizer_syscall_pre_impl_setgroups((long)(gidsetsize), (long)(grouplist))
+#define __sanitizer_syscall_post_setgroups(res, gidsetsize, grouplist) \
+  __sanitizer_syscall_post_impl_setgroups(res, (long)(gidsetsize),     \
+                                          (long)(grouplist))
+#define __sanitizer_syscall_pre_acct(name) \
+  __sanitizer_syscall_pre_impl_acct((long)(name))
+#define __sanitizer_syscall_post_acct(res, name) \
+  __sanitizer_syscall_post_impl_acct(res, (long)(name))
+#define __sanitizer_syscall_pre_capget(header, dataptr) \
+  __sanitizer_syscall_pre_impl_capget((long)(header), (long)(dataptr))
+#define __sanitizer_syscall_post_capget(res, header, dataptr) \
+  __sanitizer_syscall_post_impl_capget(res, (long)(header), (long)(dataptr))
+#define __sanitizer_syscall_pre_capset(header, data) \
+  __sanitizer_syscall_pre_impl_capset((long)(header), (long)(data))
+#define __sanitizer_syscall_post_capset(res, header, data) \
+  __sanitizer_syscall_post_impl_capset(res, (long)(header), (long)(data))
+#define __sanitizer_syscall_pre_personality(personality) \
+  __sanitizer_syscall_pre_impl_personality((long)(personality))
+#define __sanitizer_syscall_post_personality(res, personality) \
+  __sanitizer_syscall_post_impl_personality(res, (long)(personality))
+#define __sanitizer_syscall_pre_sigpending(set) \
+  __sanitizer_syscall_pre_impl_sigpending((long)(set))
+#define __sanitizer_syscall_post_sigpending(res, set) \
+  __sanitizer_syscall_post_impl_sigpending(res, (long)(set))
+#define __sanitizer_syscall_pre_sigprocmask(how, set, oset)          \
+  __sanitizer_syscall_pre_impl_sigprocmask((long)(how), (long)(set), \
+                                           (long)(oset))
+#define __sanitizer_syscall_post_sigprocmask(res, how, set, oset)          \
+  __sanitizer_syscall_post_impl_sigprocmask(res, (long)(how), (long)(set), \
+                                            (long)(oset))
+#define __sanitizer_syscall_pre_getitimer(which, value) \
+  __sanitizer_syscall_pre_impl_getitimer((long)(which), (long)(value))
+#define __sanitizer_syscall_post_getitimer(res, which, value) \
+  __sanitizer_syscall_post_impl_getitimer(res, (long)(which), (long)(value))
+#define __sanitizer_syscall_pre_setitimer(which, value, ovalue)        \
+  __sanitizer_syscall_pre_impl_setitimer((long)(which), (long)(value), \
+                                         (long)(ovalue))
+#define __sanitizer_syscall_post_setitimer(res, which, value, ovalue)        \
+  __sanitizer_syscall_post_impl_setitimer(res, (long)(which), (long)(value), \
+                                          (long)(ovalue))
+#define __sanitizer_syscall_pre_timer_create(which_clock, timer_event_spec, \
+                                             created_timer_id)              \
+  __sanitizer_syscall_pre_impl_timer_create(                                \
+      (long)(which_clock), (long)(timer_event_spec), (long)(created_timer_id))
+#define __sanitizer_syscall_post_timer_create(                         \
+    res, which_clock, timer_event_spec, created_timer_id)              \
+  __sanitizer_syscall_post_impl_timer_create(res, (long)(which_clock), \
+                                             (long)(timer_event_spec), \
+                                             (long)(created_timer_id))
+#define __sanitizer_syscall_pre_timer_gettime(timer_id, setting) \
+  __sanitizer_syscall_pre_impl_timer_gettime((long)(timer_id), (long)(setting))
+#define __sanitizer_syscall_post_timer_gettime(res, timer_id, setting) \
+  __sanitizer_syscall_post_impl_timer_gettime(res, (long)(timer_id),   \
+                                              (long)(setting))
+#define __sanitizer_syscall_pre_timer_getoverrun(timer_id) \
+  __sanitizer_syscall_pre_impl_timer_getoverrun((long)(timer_id))
+#define __sanitizer_syscall_post_timer_getoverrun(res, timer_id) \
+  __sanitizer_syscall_post_impl_timer_getoverrun(res, (long)(timer_id))
+#define __sanitizer_syscall_pre_timer_settime(timer_id, flags, new_setting,   \
+                                              old_setting)                    \
+  __sanitizer_syscall_pre_impl_timer_settime((long)(timer_id), (long)(flags), \
+                                             (long)(new_setting),             \
+                                             (long)(old_setting))
+#define __sanitizer_syscall_post_timer_settime(res, timer_id, flags,     \
+                                               new_setting, old_setting) \
+  __sanitizer_syscall_post_impl_timer_settime(                           \
+      res, (long)(timer_id), (long)(flags), (long)(new_setting),         \
+      (long)(old_setting))
+#define __sanitizer_syscall_pre_timer_delete(timer_id) \
+  __sanitizer_syscall_pre_impl_timer_delete((long)(timer_id))
+#define __sanitizer_syscall_post_timer_delete(res, timer_id) \
+  __sanitizer_syscall_post_impl_timer_delete(res, (long)(timer_id))
+#define __sanitizer_syscall_pre_clock_settime(which_clock, tp) \
+  __sanitizer_syscall_pre_impl_clock_settime((long)(which_clock), (long)(tp))
+#define __sanitizer_syscall_post_clock_settime(res, which_clock, tp)    \
+  __sanitizer_syscall_post_impl_clock_settime(res, (long)(which_clock), \
+                                              (long)(tp))
+#define __sanitizer_syscall_pre_clock_gettime(which_clock, tp) \
+  __sanitizer_syscall_pre_impl_clock_gettime((long)(which_clock), (long)(tp))
+#define __sanitizer_syscall_post_clock_gettime(res, which_clock, tp)    \
+  __sanitizer_syscall_post_impl_clock_gettime(res, (long)(which_clock), \
+                                              (long)(tp))
+#define __sanitizer_syscall_pre_clock_adjtime(which_clock, tx) \
+  __sanitizer_syscall_pre_impl_clock_adjtime((long)(which_clock), (long)(tx))
+#define __sanitizer_syscall_post_clock_adjtime(res, which_clock, tx)    \
+  __sanitizer_syscall_post_impl_clock_adjtime(res, (long)(which_clock), \
+                                              (long)(tx))
+#define __sanitizer_syscall_pre_clock_getres(which_clock, tp) \
+  __sanitizer_syscall_pre_impl_clock_getres((long)(which_clock), (long)(tp))
+#define __sanitizer_syscall_post_clock_getres(res, which_clock, tp)    \
+  __sanitizer_syscall_post_impl_clock_getres(res, (long)(which_clock), \
+                                             (long)(tp))
+#define __sanitizer_syscall_pre_clock_nanosleep(which_clock, flags, rqtp, \
+                                                rmtp)                     \
+  __sanitizer_syscall_pre_impl_clock_nanosleep(                           \
+      (long)(which_clock), (long)(flags), (long)(rqtp), (long)(rmtp))
+#define __sanitizer_syscall_post_clock_nanosleep(res, which_clock, flags, \
+                                                 rqtp, rmtp)              \
+  __sanitizer_syscall_post_impl_clock_nanosleep(                          \
+      res, (long)(which_clock), (long)(flags), (long)(rqtp), (long)(rmtp))
+#define __sanitizer_syscall_pre_nice(increment) \
+  __sanitizer_syscall_pre_impl_nice((long)(increment))
+#define __sanitizer_syscall_post_nice(res, increment) \
+  __sanitizer_syscall_post_impl_nice(res, (long)(increment))
+#define __sanitizer_syscall_pre_sched_setscheduler(pid, policy, param)         \
+  __sanitizer_syscall_pre_impl_sched_setscheduler((long)(pid), (long)(policy), \
+                                                  (long)(param))
+#define __sanitizer_syscall_post_sched_setscheduler(res, pid, policy, param) \
+  __sanitizer_syscall_post_impl_sched_setscheduler(                          \
+      res, (long)(pid), (long)(policy), (long)(param))
+#define __sanitizer_syscall_pre_sched_setparam(pid, param) \
+  __sanitizer_syscall_pre_impl_sched_setparam((long)(pid), (long)(param))
+#define __sanitizer_syscall_post_sched_setparam(res, pid, param) \
+  __sanitizer_syscall_post_impl_sched_setparam(res, (long)(pid), (long)(param))
+#define __sanitizer_syscall_pre_sched_getscheduler(pid) \
+  __sanitizer_syscall_pre_impl_sched_getscheduler((long)(pid))
+#define __sanitizer_syscall_post_sched_getscheduler(res, pid) \
+  __sanitizer_syscall_post_impl_sched_getscheduler(res, (long)(pid))
+#define __sanitizer_syscall_pre_sched_getparam(pid, param) \
+  __sanitizer_syscall_pre_impl_sched_getparam((long)(pid), (long)(param))
+#define __sanitizer_syscall_post_sched_getparam(res, pid, param) \
+  __sanitizer_syscall_post_impl_sched_getparam(res, (long)(pid), (long)(param))
+#define __sanitizer_syscall_pre_sched_setaffinity(pid, len, user_mask_ptr) \
+  __sanitizer_syscall_pre_impl_sched_setaffinity((long)(pid), (long)(len), \
+                                                 (long)(user_mask_ptr))
+#define __sanitizer_syscall_post_sched_setaffinity(res, pid, len, \
+                                                   user_mask_ptr) \
+  __sanitizer_syscall_post_impl_sched_setaffinity(                \
+      res, (long)(pid), (long)(len), (long)(user_mask_ptr))
+#define __sanitizer_syscall_pre_sched_getaffinity(pid, len, user_mask_ptr) \
+  __sanitizer_syscall_pre_impl_sched_getaffinity((long)(pid), (long)(len), \
+                                                 (long)(user_mask_ptr))
+#define __sanitizer_syscall_post_sched_getaffinity(res, pid, len, \
+                                                   user_mask_ptr) \
+  __sanitizer_syscall_post_impl_sched_getaffinity(                \
+      res, (long)(pid), (long)(len), (long)(user_mask_ptr))
+#define __sanitizer_syscall_pre_sched_yield() \
+  __sanitizer_syscall_pre_impl_sched_yield()
+#define __sanitizer_syscall_post_sched_yield(res) \
+  __sanitizer_syscall_post_impl_sched_yield(res)
+#define __sanitizer_syscall_pre_sched_get_priority_max(policy) \
+  __sanitizer_syscall_pre_impl_sched_get_priority_max((long)(policy))
+#define __sanitizer_syscall_post_sched_get_priority_max(res, policy) \
+  __sanitizer_syscall_post_impl_sched_get_priority_max(res, (long)(policy))
+#define __sanitizer_syscall_pre_sched_get_priority_min(policy) \
+  __sanitizer_syscall_pre_impl_sched_get_priority_min((long)(policy))
+#define __sanitizer_syscall_post_sched_get_priority_min(res, policy) \
+  __sanitizer_syscall_post_impl_sched_get_priority_min(res, (long)(policy))
+#define __sanitizer_syscall_pre_sched_rr_get_interval(pid, interval) \
+  __sanitizer_syscall_pre_impl_sched_rr_get_interval((long)(pid),    \
+                                                     (long)(interval))
+#define __sanitizer_syscall_post_sched_rr_get_interval(res, pid, interval) \
+  __sanitizer_syscall_post_impl_sched_rr_get_interval(res, (long)(pid),    \
+                                                      (long)(interval))
+#define __sanitizer_syscall_pre_setpriority(which, who, niceval)       \
+  __sanitizer_syscall_pre_impl_setpriority((long)(which), (long)(who), \
+                                           (long)(niceval))
+#define __sanitizer_syscall_post_setpriority(res, which, who, niceval)       \
+  __sanitizer_syscall_post_impl_setpriority(res, (long)(which), (long)(who), \
+                                            (long)(niceval))
+#define __sanitizer_syscall_pre_getpriority(which, who) \
+  __sanitizer_syscall_pre_impl_getpriority((long)(which), (long)(who))
+#define __sanitizer_syscall_post_getpriority(res, which, who) \
+  __sanitizer_syscall_post_impl_getpriority(res, (long)(which), (long)(who))
+#define __sanitizer_syscall_pre_shutdown(arg0, arg1) \
+  __sanitizer_syscall_pre_impl_shutdown((long)(arg0), (long)(arg1))
+#define __sanitizer_syscall_post_shutdown(res, arg0, arg1) \
+  __sanitizer_syscall_post_impl_shutdown(res, (long)(arg0), (long)(arg1))
+#define __sanitizer_syscall_pre_reboot(magic1, magic2, cmd, arg)      \
+  __sanitizer_syscall_pre_impl_reboot((long)(magic1), (long)(magic2), \
+                                      (long)(cmd), (long)(arg))
+#define __sanitizer_syscall_post_reboot(res, magic1, magic2, cmd, arg)      \
+  __sanitizer_syscall_post_impl_reboot(res, (long)(magic1), (long)(magic2), \
+                                       (long)(cmd), (long)(arg))
+#define __sanitizer_syscall_pre_restart_syscall() \
+  __sanitizer_syscall_pre_impl_restart_syscall()
+#define __sanitizer_syscall_post_restart_syscall(res) \
+  __sanitizer_syscall_post_impl_restart_syscall(res)
+#define __sanitizer_syscall_pre_kexec_load(entry, nr_segments, segments,      \
+                                           flags)                             \
+  __sanitizer_syscall_pre_impl_kexec_load((long)(entry), (long)(nr_segments), \
+                                          (long)(segments), (long)(flags))
+#define __sanitizer_syscall_post_kexec_load(res, entry, nr_segments, segments, \
+                                            flags)                             \
+  __sanitizer_syscall_post_impl_kexec_load(res, (long)(entry),                 \
+                                           (long)(nr_segments),                \
+                                           (long)(segments), (long)(flags))
+#define __sanitizer_syscall_pre_exit(error_code) \
+  __sanitizer_syscall_pre_impl_exit((long)(error_code))
+#define __sanitizer_syscall_post_exit(res, error_code) \
+  __sanitizer_syscall_post_impl_exit(res, (long)(error_code))
+#define __sanitizer_syscall_pre_exit_group(error_code) \
+  __sanitizer_syscall_pre_impl_exit_group((long)(error_code))
+#define __sanitizer_syscall_post_exit_group(res, error_code) \
+  __sanitizer_syscall_post_impl_exit_group(res, (long)(error_code))
+#define __sanitizer_syscall_pre_wait4(pid, stat_addr, options, ru)   \
+  __sanitizer_syscall_pre_impl_wait4((long)(pid), (long)(stat_addr), \
+                                     (long)(options), (long)(ru))
+#define __sanitizer_syscall_post_wait4(res, pid, stat_addr, options, ru)   \
+  __sanitizer_syscall_post_impl_wait4(res, (long)(pid), (long)(stat_addr), \
+                                      (long)(options), (long)(ru))
+#define __sanitizer_syscall_pre_waitid(which, pid, infop, options, ru) \
+  __sanitizer_syscall_pre_impl_waitid(                                 \
+      (long)(which), (long)(pid), (long)(infop), (long)(options), (long)(ru))
+#define __sanitizer_syscall_post_waitid(res, which, pid, infop, options, ru) \
+  __sanitizer_syscall_post_impl_waitid(res, (long)(which), (long)(pid),      \
+                                       (long)(infop), (long)(options),       \
+                                       (long)(ru))
+#define __sanitizer_syscall_pre_waitpid(pid, stat_addr, options)       \
+  __sanitizer_syscall_pre_impl_waitpid((long)(pid), (long)(stat_addr), \
+                                       (long)(options))
+#define __sanitizer_syscall_post_waitpid(res, pid, stat_addr, options)       \
+  __sanitizer_syscall_post_impl_waitpid(res, (long)(pid), (long)(stat_addr), \
+                                        (long)(options))
+#define __sanitizer_syscall_pre_set_tid_address(tidptr) \
+  __sanitizer_syscall_pre_impl_set_tid_address((long)(tidptr))
+#define __sanitizer_syscall_post_set_tid_address(res, tidptr) \
+  __sanitizer_syscall_post_impl_set_tid_address(res, (long)(tidptr))
+#define __sanitizer_syscall_pre_init_module(umod, len, uargs)         \
+  __sanitizer_syscall_pre_impl_init_module((long)(umod), (long)(len), \
+                                           (long)(uargs))
+#define __sanitizer_syscall_post_init_module(res, umod, len, uargs)         \
+  __sanitizer_syscall_post_impl_init_module(res, (long)(umod), (long)(len), \
+                                            (long)(uargs))
+#define __sanitizer_syscall_pre_delete_module(name_user, flags) \
+  __sanitizer_syscall_pre_impl_delete_module((long)(name_user), (long)(flags))
+#define __sanitizer_syscall_post_delete_module(res, name_user, flags) \
+  __sanitizer_syscall_post_impl_delete_module(res, (long)(name_user), \
+                                              (long)(flags))
+#define __sanitizer_syscall_pre_rt_sigprocmask(how, set, oset, sigsetsize) \
+  __sanitizer_syscall_pre_impl_rt_sigprocmask(                             \
+      (long)(how), (long)(set), (long)(oset), (long)(sigsetsize))
+#define __sanitizer_syscall_post_rt_sigprocmask(res, how, set, oset, \
+                                                sigsetsize)          \
+  __sanitizer_syscall_post_impl_rt_sigprocmask(                      \
+      res, (long)(how), (long)(set), (long)(oset), (long)(sigsetsize))
+#define __sanitizer_syscall_pre_rt_sigpending(set, sigsetsize) \
+  __sanitizer_syscall_pre_impl_rt_sigpending((long)(set), (long)(sigsetsize))
+#define __sanitizer_syscall_post_rt_sigpending(res, set, sigsetsize) \
+  __sanitizer_syscall_post_impl_rt_sigpending(res, (long)(set),      \
+                                              (long)(sigsetsize))
+#define __sanitizer_syscall_pre_rt_sigtimedwait(uthese, uinfo, uts, \
+                                                sigsetsize)         \
+  __sanitizer_syscall_pre_impl_rt_sigtimedwait(                     \
+      (long)(uthese), (long)(uinfo), (long)(uts), (long)(sigsetsize))
+#define __sanitizer_syscall_post_rt_sigtimedwait(res, uthese, uinfo, uts, \
+                                                 sigsetsize)              \
+  __sanitizer_syscall_post_impl_rt_sigtimedwait(                          \
+      res, (long)(uthese), (long)(uinfo), (long)(uts), (long)(sigsetsize))
+#define __sanitizer_syscall_pre_rt_tgsigqueueinfo(tgid, pid, sig, uinfo)    \
+  __sanitizer_syscall_pre_impl_rt_tgsigqueueinfo((long)(tgid), (long)(pid), \
+                                                 (long)(sig), (long)(uinfo))
+#define __sanitizer_syscall_post_rt_tgsigqueueinfo(res, tgid, pid, sig, uinfo) \
+  __sanitizer_syscall_post_impl_rt_tgsigqueueinfo(                             \
+      res, (long)(tgid), (long)(pid), (long)(sig), (long)(uinfo))
+#define __sanitizer_syscall_pre_kill(pid, sig) \
+  __sanitizer_syscall_pre_impl_kill((long)(pid), (long)(sig))
+#define __sanitizer_syscall_post_kill(res, pid, sig) \
+  __sanitizer_syscall_post_impl_kill(res, (long)(pid), (long)(sig))
+#define __sanitizer_syscall_pre_tgkill(tgid, pid, sig) \
+  __sanitizer_syscall_pre_impl_tgkill((long)(tgid), (long)(pid), (long)(sig))
+#define __sanitizer_syscall_post_tgkill(res, tgid, pid, sig)           \
+  __sanitizer_syscall_post_impl_tgkill(res, (long)(tgid), (long)(pid), \
+                                       (long)(sig))
+#define __sanitizer_syscall_pre_tkill(pid, sig) \
+  __sanitizer_syscall_pre_impl_tkill((long)(pid), (long)(sig))
+#define __sanitizer_syscall_post_tkill(res, pid, sig) \
+  __sanitizer_syscall_post_impl_tkill(res, (long)(pid), (long)(sig))
+#define __sanitizer_syscall_pre_rt_sigqueueinfo(pid, sig, uinfo)         \
+  __sanitizer_syscall_pre_impl_rt_sigqueueinfo((long)(pid), (long)(sig), \
+                                               (long)(uinfo))
+#define __sanitizer_syscall_post_rt_sigqueueinfo(res, pid, sig, uinfo)         \
+  __sanitizer_syscall_post_impl_rt_sigqueueinfo(res, (long)(pid), (long)(sig), \
+                                                (long)(uinfo))
+#define __sanitizer_syscall_pre_sgetmask() \
+  __sanitizer_syscall_pre_impl_sgetmask()
+#define __sanitizer_syscall_post_sgetmask(res) \
+  __sanitizer_syscall_post_impl_sgetmask(res)
+#define __sanitizer_syscall_pre_ssetmask(newmask) \
+  __sanitizer_syscall_pre_impl_ssetmask((long)(newmask))
+#define __sanitizer_syscall_post_ssetmask(res, newmask) \
+  __sanitizer_syscall_post_impl_ssetmask(res, (long)(newmask))
+#define __sanitizer_syscall_pre_signal(sig, handler) \
+  __sanitizer_syscall_pre_impl_signal((long)(sig), (long)(handler))
+#define __sanitizer_syscall_post_signal(res, sig, handler) \
+  __sanitizer_syscall_post_impl_signal(res, (long)(sig), (long)(handler))
+#define __sanitizer_syscall_pre_pause() __sanitizer_syscall_pre_impl_pause()
+#define __sanitizer_syscall_post_pause(res) \
+  __sanitizer_syscall_post_impl_pause(res)
+#define __sanitizer_syscall_pre_sync() __sanitizer_syscall_pre_impl_sync()
+#define __sanitizer_syscall_post_sync(res) \
+  __sanitizer_syscall_post_impl_sync(res)
+#define __sanitizer_syscall_pre_fsync(fd) \
+  __sanitizer_syscall_pre_impl_fsync((long)(fd))
+#define __sanitizer_syscall_post_fsync(res, fd) \
+  __sanitizer_syscall_post_impl_fsync(res, (long)(fd))
+#define __sanitizer_syscall_pre_fdatasync(fd) \
+  __sanitizer_syscall_pre_impl_fdatasync((long)(fd))
+#define __sanitizer_syscall_post_fdatasync(res, fd) \
+  __sanitizer_syscall_post_impl_fdatasync(res, (long)(fd))
+#define __sanitizer_syscall_pre_bdflush(func, data) \
+  __sanitizer_syscall_pre_impl_bdflush((long)(func), (long)(data))
+#define __sanitizer_syscall_post_bdflush(res, func, data) \
+  __sanitizer_syscall_post_impl_bdflush(res, (long)(func), (long)(data))
+#define __sanitizer_syscall_pre_mount(dev_name, dir_name, type, flags, data) \
+  __sanitizer_syscall_pre_impl_mount((long)(dev_name), (long)(dir_name),     \
+                                     (long)(type), (long)(flags),            \
+                                     (long)(data))
+#define __sanitizer_syscall_post_mount(res, dev_name, dir_name, type, flags,   \
+                                       data)                                   \
+  __sanitizer_syscall_post_impl_mount(res, (long)(dev_name), (long)(dir_name), \
+                                      (long)(type), (long)(flags),             \
+                                      (long)(data))
+#define __sanitizer_syscall_pre_umount(name, flags) \
+  __sanitizer_syscall_pre_impl_umount((long)(name), (long)(flags))
+#define __sanitizer_syscall_post_umount(res, name, flags) \
+  __sanitizer_syscall_post_impl_umount(res, (long)(name), (long)(flags))
+#define __sanitizer_syscall_pre_oldumount(name) \
+  __sanitizer_syscall_pre_impl_oldumount((long)(name))
+#define __sanitizer_syscall_post_oldumount(res, name) \
+  __sanitizer_syscall_post_impl_oldumount(res, (long)(name))
+#define __sanitizer_syscall_pre_truncate(path, length) \
+  __sanitizer_syscall_pre_impl_truncate((long)(path), (long)(length))
+#define __sanitizer_syscall_post_truncate(res, path, length) \
+  __sanitizer_syscall_post_impl_truncate(res, (long)(path), (long)(length))
+#define __sanitizer_syscall_pre_ftruncate(fd, length) \
+  __sanitizer_syscall_pre_impl_ftruncate((long)(fd), (long)(length))
+#define __sanitizer_syscall_post_ftruncate(res, fd, length) \
+  __sanitizer_syscall_post_impl_ftruncate(res, (long)(fd), (long)(length))
+#define __sanitizer_syscall_pre_stat(filename, statbuf) \
+  __sanitizer_syscall_pre_impl_stat((long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_post_stat(res, filename, statbuf) \
+  __sanitizer_syscall_post_impl_stat(res, (long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_pre_statfs(path, buf) \
+  __sanitizer_syscall_pre_impl_statfs((long)(path), (long)(buf))
+#define __sanitizer_syscall_post_statfs(res, path, buf) \
+  __sanitizer_syscall_post_impl_statfs(res, (long)(path), (long)(buf))
+#define __sanitizer_syscall_pre_statfs64(path, sz, buf) \
+  __sanitizer_syscall_pre_impl_statfs64((long)(path), (long)(sz), (long)(buf))
+#define __sanitizer_syscall_post_statfs64(res, path, sz, buf)           \
+  __sanitizer_syscall_post_impl_statfs64(res, (long)(path), (long)(sz), \
+                                         (long)(buf))
+#define __sanitizer_syscall_pre_fstatfs(fd, buf) \
+  __sanitizer_syscall_pre_impl_fstatfs((long)(fd), (long)(buf))
+#define __sanitizer_syscall_post_fstatfs(res, fd, buf) \
+  __sanitizer_syscall_post_impl_fstatfs(res, (long)(fd), (long)(buf))
+#define __sanitizer_syscall_pre_fstatfs64(fd, sz, buf) \
+  __sanitizer_syscall_pre_impl_fstatfs64((long)(fd), (long)(sz), (long)(buf))
+#define __sanitizer_syscall_post_fstatfs64(res, fd, sz, buf)           \
+  __sanitizer_syscall_post_impl_fstatfs64(res, (long)(fd), (long)(sz), \
+                                          (long)(buf))
+#define __sanitizer_syscall_pre_lstat(filename, statbuf) \
+  __sanitizer_syscall_pre_impl_lstat((long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_post_lstat(res, filename, statbuf) \
+  __sanitizer_syscall_post_impl_lstat(res, (long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_pre_fstat(fd, statbuf) \
+  __sanitizer_syscall_pre_impl_fstat((long)(fd), (long)(statbuf))
+#define __sanitizer_syscall_post_fstat(res, fd, statbuf) \
+  __sanitizer_syscall_post_impl_fstat(res, (long)(fd), (long)(statbuf))
+#define __sanitizer_syscall_pre_newstat(filename, statbuf) \
+  __sanitizer_syscall_pre_impl_newstat((long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_post_newstat(res, filename, statbuf) \
+  __sanitizer_syscall_post_impl_newstat(res, (long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_pre_newlstat(filename, statbuf) \
+  __sanitizer_syscall_pre_impl_newlstat((long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_post_newlstat(res, filename, statbuf) \
+  __sanitizer_syscall_post_impl_newlstat(res, (long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_pre_newfstat(fd, statbuf) \
+  __sanitizer_syscall_pre_impl_newfstat((long)(fd), (long)(statbuf))
+#define __sanitizer_syscall_post_newfstat(res, fd, statbuf) \
+  __sanitizer_syscall_post_impl_newfstat(res, (long)(fd), (long)(statbuf))
+#define __sanitizer_syscall_pre_ustat(dev, ubuf) \
+  __sanitizer_syscall_pre_impl_ustat((long)(dev), (long)(ubuf))
+#define __sanitizer_syscall_post_ustat(res, dev, ubuf) \
+  __sanitizer_syscall_post_impl_ustat(res, (long)(dev), (long)(ubuf))
+#define __sanitizer_syscall_pre_stat64(filename, statbuf) \
+  __sanitizer_syscall_pre_impl_stat64((long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_post_stat64(res, filename, statbuf) \
+  __sanitizer_syscall_post_impl_stat64(res, (long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_pre_fstat64(fd, statbuf) \
+  __sanitizer_syscall_pre_impl_fstat64((long)(fd), (long)(statbuf))
+#define __sanitizer_syscall_post_fstat64(res, fd, statbuf) \
+  __sanitizer_syscall_post_impl_fstat64(res, (long)(fd), (long)(statbuf))
+#define __sanitizer_syscall_pre_lstat64(filename, statbuf) \
+  __sanitizer_syscall_pre_impl_lstat64((long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_post_lstat64(res, filename, statbuf) \
+  __sanitizer_syscall_post_impl_lstat64(res, (long)(filename), (long)(statbuf))
+#define __sanitizer_syscall_pre_setxattr(path, name, value, size, flags) \
+  __sanitizer_syscall_pre_impl_setxattr(                                 \
+      (long)(path), (long)(name), (long)(value), (long)(size), (long)(flags))
+#define __sanitizer_syscall_post_setxattr(res, path, name, value, size, flags) \
+  __sanitizer_syscall_post_impl_setxattr(res, (long)(path), (long)(name),      \
+                                         (long)(value), (long)(size),          \
+                                         (long)(flags))
+#define __sanitizer_syscall_pre_lsetxattr(path, name, value, size, flags) \
+  __sanitizer_syscall_pre_impl_lsetxattr(                                 \
+      (long)(path), (long)(name), (long)(value), (long)(size), (long)(flags))
+#define __sanitizer_syscall_post_lsetxattr(res, path, name, value, size,   \
+                                           flags)                          \
+  __sanitizer_syscall_post_impl_lsetxattr(res, (long)(path), (long)(name), \
+                                          (long)(value), (long)(size),     \
+                                          (long)(flags))
+#define __sanitizer_syscall_pre_fsetxattr(fd, name, value, size, flags) \
+  __sanitizer_syscall_pre_impl_fsetxattr(                               \
+      (long)(fd), (long)(name), (long)(value), (long)(size), (long)(flags))
+#define __sanitizer_syscall_post_fsetxattr(res, fd, name, value, size, flags) \
+  __sanitizer_syscall_post_impl_fsetxattr(res, (long)(fd), (long)(name),      \
+                                          (long)(value), (long)(size),        \
+                                          (long)(flags))
+#define __sanitizer_syscall_pre_getxattr(path, name, value, size)   \
+  __sanitizer_syscall_pre_impl_getxattr((long)(path), (long)(name), \
+                                        (long)(value), (long)(size))
+#define __sanitizer_syscall_post_getxattr(res, path, name, value, size)   \
+  __sanitizer_syscall_post_impl_getxattr(res, (long)(path), (long)(name), \
+                                         (long)(value), (long)(size))
+#define __sanitizer_syscall_pre_lgetxattr(path, name, value, size)   \
+  __sanitizer_syscall_pre_impl_lgetxattr((long)(path), (long)(name), \
+                                         (long)(value), (long)(size))
+#define __sanitizer_syscall_post_lgetxattr(res, path, name, value, size)   \
+  __sanitizer_syscall_post_impl_lgetxattr(res, (long)(path), (long)(name), \
+                                          (long)(value), (long)(size))
+#define __sanitizer_syscall_pre_fgetxattr(fd, name, value, size)   \
+  __sanitizer_syscall_pre_impl_fgetxattr((long)(fd), (long)(name), \
+                                         (long)(value), (long)(size))
+#define __sanitizer_syscall_post_fgetxattr(res, fd, name, value, size)   \
+  __sanitizer_syscall_post_impl_fgetxattr(res, (long)(fd), (long)(name), \
+                                          (long)(value), (long)(size))
+#define __sanitizer_syscall_pre_listxattr(path, list, size)          \
+  __sanitizer_syscall_pre_impl_listxattr((long)(path), (long)(list), \
+                                         (long)(size))
+#define __sanitizer_syscall_post_listxattr(res, path, list, size)          \
+  __sanitizer_syscall_post_impl_listxattr(res, (long)(path), (long)(list), \
+                                          (long)(size))
+#define __sanitizer_syscall_pre_llistxattr(path, list, size)          \
+  __sanitizer_syscall_pre_impl_llistxattr((long)(path), (long)(list), \
+                                          (long)(size))
+#define __sanitizer_syscall_post_llistxattr(res, path, list, size)          \
+  __sanitizer_syscall_post_impl_llistxattr(res, (long)(path), (long)(list), \
+                                           (long)(size))
+#define __sanitizer_syscall_pre_flistxattr(fd, list, size)          \
+  __sanitizer_syscall_pre_impl_flistxattr((long)(fd), (long)(list), \
+                                          (long)(size))
+#define __sanitizer_syscall_post_flistxattr(res, fd, list, size)          \
+  __sanitizer_syscall_post_impl_flistxattr(res, (long)(fd), (long)(list), \
+                                           (long)(size))
+#define __sanitizer_syscall_pre_removexattr(path, name) \
+  __sanitizer_syscall_pre_impl_removexattr((long)(path), (long)(name))
+#define __sanitizer_syscall_post_removexattr(res, path, name) \
+  __sanitizer_syscall_post_impl_removexattr(res, (long)(path), (long)(name))
+#define __sanitizer_syscall_pre_lremovexattr(path, name) \
+  __sanitizer_syscall_pre_impl_lremovexattr((long)(path), (long)(name))
+#define __sanitizer_syscall_post_lremovexattr(res, path, name) \
+  __sanitizer_syscall_post_impl_lremovexattr(res, (long)(path), (long)(name))
+#define __sanitizer_syscall_pre_fremovexattr(fd, name) \
+  __sanitizer_syscall_pre_impl_fremovexattr((long)(fd), (long)(name))
+#define __sanitizer_syscall_post_fremovexattr(res, fd, name) \
+  __sanitizer_syscall_post_impl_fremovexattr(res, (long)(fd), (long)(name))
+#define __sanitizer_syscall_pre_brk(brk) \
+  __sanitizer_syscall_pre_impl_brk((long)(brk))
+#define __sanitizer_syscall_post_brk(res, brk) \
+  __sanitizer_syscall_post_impl_brk(res, (long)(brk))
+#define __sanitizer_syscall_pre_mprotect(start, len, prot)          \
+  __sanitizer_syscall_pre_impl_mprotect((long)(start), (long)(len), \
+                                        (long)(prot))
+#define __sanitizer_syscall_post_mprotect(res, start, len, prot)          \
+  __sanitizer_syscall_post_impl_mprotect(res, (long)(start), (long)(len), \
+                                         (long)(prot))
+#define __sanitizer_syscall_pre_mremap(addr, old_len, new_len, flags, \
+                                       new_addr)                      \
+  __sanitizer_syscall_pre_impl_mremap((long)(addr), (long)(old_len),  \
+                                      (long)(new_len), (long)(flags), \
+                                      (long)(new_addr))
+#define __sanitizer_syscall_post_mremap(res, addr, old_len, new_len, flags, \
+                                        new_addr)                           \
+  __sanitizer_syscall_post_impl_mremap(res, (long)(addr), (long)(old_len),  \
+                                       (long)(new_len), (long)(flags),      \
+                                       (long)(new_addr))
+#define __sanitizer_syscall_pre_remap_file_pages(start, size, prot, pgoff, \
+                                                 flags)                    \
+  __sanitizer_syscall_pre_impl_remap_file_pages(                           \
+      (long)(start), (long)(size), (long)(prot), (long)(pgoff), (long)(flags))
+#define __sanitizer_syscall_post_remap_file_pages(res, start, size, prot,    \
+                                                  pgoff, flags)              \
+  __sanitizer_syscall_post_impl_remap_file_pages(res, (long)(start),         \
+                                                 (long)(size), (long)(prot), \
+                                                 (long)(pgoff), (long)(flags))
+#define __sanitizer_syscall_pre_msync(start, len, flags) \
+  __sanitizer_syscall_pre_impl_msync((long)(start), (long)(len), (long)(flags))
+#define __sanitizer_syscall_post_msync(res, start, len, flags)         \
+  __sanitizer_syscall_post_impl_msync(res, (long)(start), (long)(len), \
+                                      (long)(flags))
+#define __sanitizer_syscall_pre_munmap(addr, len) \
+  __sanitizer_syscall_pre_impl_munmap((long)(addr), (long)(len))
+#define __sanitizer_syscall_post_munmap(res, addr, len) \
+  __sanitizer_syscall_post_impl_munmap(res, (long)(addr), (long)(len))
+#define __sanitizer_syscall_pre_mlock(start, len) \
+  __sanitizer_syscall_pre_impl_mlock((long)(start), (long)(len))
+#define __sanitizer_syscall_post_mlock(res, start, len) \
+  __sanitizer_syscall_post_impl_mlock(res, (long)(start), (long)(len))
+#define __sanitizer_syscall_pre_munlock(start, len) \
+  __sanitizer_syscall_pre_impl_munlock((long)(start), (long)(len))
+#define __sanitizer_syscall_post_munlock(res, start, len) \
+  __sanitizer_syscall_post_impl_munlock(res, (long)(start), (long)(len))
+#define __sanitizer_syscall_pre_mlockall(flags) \
+  __sanitizer_syscall_pre_impl_mlockall((long)(flags))
+#define __sanitizer_syscall_post_mlockall(res, flags) \
+  __sanitizer_syscall_post_impl_mlockall(res, (long)(flags))
+#define __sanitizer_syscall_pre_munlockall() \
+  __sanitizer_syscall_pre_impl_munlockall()
+#define __sanitizer_syscall_post_munlockall(res) \
+  __sanitizer_syscall_post_impl_munlockall(res)
+#define __sanitizer_syscall_pre_madvise(start, len, behavior)      \
+  __sanitizer_syscall_pre_impl_madvise((long)(start), (long)(len), \
+                                       (long)(behavior))
+#define __sanitizer_syscall_post_madvise(res, start, len, behavior)      \
+  __sanitizer_syscall_post_impl_madvise(res, (long)(start), (long)(len), \
+                                        (long)(behavior))
+#define __sanitizer_syscall_pre_mincore(start, len, vec) \
+  __sanitizer_syscall_pre_impl_mincore((long)(start), (long)(len), (long)(vec))
+#define __sanitizer_syscall_post_mincore(res, start, len, vec)           \
+  __sanitizer_syscall_post_impl_mincore(res, (long)(start), (long)(len), \
+                                        (long)(vec))
+#define __sanitizer_syscall_pre_pivot_root(new_root, put_old) \
+  __sanitizer_syscall_pre_impl_pivot_root((long)(new_root), (long)(put_old))
+#define __sanitizer_syscall_post_pivot_root(res, new_root, put_old) \
+  __sanitizer_syscall_post_impl_pivot_root(res, (long)(new_root),   \
+                                           (long)(put_old))
+#define __sanitizer_syscall_pre_chroot(filename) \
+  __sanitizer_syscall_pre_impl_chroot((long)(filename))
+#define __sanitizer_syscall_post_chroot(res, filename) \
+  __sanitizer_syscall_post_impl_chroot(res, (long)(filename))
+#define __sanitizer_syscall_pre_mknod(filename, mode, dev)           \
+  __sanitizer_syscall_pre_impl_mknod((long)(filename), (long)(mode), \
+                                     (long)(dev))
+#define __sanitizer_syscall_post_mknod(res, filename, mode, dev)           \
+  __sanitizer_syscall_post_impl_mknod(res, (long)(filename), (long)(mode), \
+                                      (long)(dev))
+#define __sanitizer_syscall_pre_link(oldname, newname) \
+  __sanitizer_syscall_pre_impl_link((long)(oldname), (long)(newname))
+#define __sanitizer_syscall_post_link(res, oldname, newname) \
+  __sanitizer_syscall_post_impl_link(res, (long)(oldname), (long)(newname))
+#define __sanitizer_syscall_pre_symlink(old, new_) \
+  __sanitizer_syscall_pre_impl_symlink((long)(old), (long)(new_))
+#define __sanitizer_syscall_post_symlink(res, old, new_) \
+  __sanitizer_syscall_post_impl_symlink(res, (long)(old), (long)(new_))
+#define __sanitizer_syscall_pre_unlink(pathname) \
+  __sanitizer_syscall_pre_impl_unlink((long)(pathname))
+#define __sanitizer_syscall_post_unlink(res, pathname) \
+  __sanitizer_syscall_post_impl_unlink(res, (long)(pathname))
+#define __sanitizer_syscall_pre_rename(oldname, newname) \
+  __sanitizer_syscall_pre_impl_rename((long)(oldname), (long)(newname))
+#define __sanitizer_syscall_post_rename(res, oldname, newname) \
+  __sanitizer_syscall_post_impl_rename(res, (long)(oldname), (long)(newname))
+#define __sanitizer_syscall_pre_chmod(filename, mode) \
+  __sanitizer_syscall_pre_impl_chmod((long)(filename), (long)(mode))
+#define __sanitizer_syscall_post_chmod(res, filename, mode) \
+  __sanitizer_syscall_post_impl_chmod(res, (long)(filename), (long)(mode))
+#define __sanitizer_syscall_pre_fchmod(fd, mode) \
+  __sanitizer_syscall_pre_impl_fchmod((long)(fd), (long)(mode))
+#define __sanitizer_syscall_post_fchmod(res, fd, mode) \
+  __sanitizer_syscall_post_impl_fchmod(res, (long)(fd), (long)(mode))
+#define __sanitizer_syscall_pre_fcntl(fd, cmd, arg) \
+  __sanitizer_syscall_pre_impl_fcntl((long)(fd), (long)(cmd), (long)(arg))
+#define __sanitizer_syscall_post_fcntl(res, fd, cmd, arg) \
+  __sanitizer_syscall_post_impl_fcntl(res, (long)(fd), (long)(cmd), (long)(arg))
+#define __sanitizer_syscall_pre_fcntl64(fd, cmd, arg) \
+  __sanitizer_syscall_pre_impl_fcntl64((long)(fd), (long)(cmd), (long)(arg))
+#define __sanitizer_syscall_post_fcntl64(res, fd, cmd, arg)           \
+  __sanitizer_syscall_post_impl_fcntl64(res, (long)(fd), (long)(cmd), \
+                                        (long)(arg))
+#define __sanitizer_syscall_pre_pipe(fildes) \
+  __sanitizer_syscall_pre_impl_pipe((long)(fildes))
+#define __sanitizer_syscall_post_pipe(res, fildes) \
+  __sanitizer_syscall_post_impl_pipe(res, (long)(fildes))
+#define __sanitizer_syscall_pre_pipe2(fildes, flags) \
+  __sanitizer_syscall_pre_impl_pipe2((long)(fildes), (long)(flags))
+#define __sanitizer_syscall_post_pipe2(res, fildes, flags) \
+  __sanitizer_syscall_post_impl_pipe2(res, (long)(fildes), (long)(flags))
+#define __sanitizer_syscall_pre_dup(fildes) \
+  __sanitizer_syscall_pre_impl_dup((long)(fildes))
+#define __sanitizer_syscall_post_dup(res, fildes) \
+  __sanitizer_syscall_post_impl_dup(res, (long)(fildes))
+#define __sanitizer_syscall_pre_dup2(oldfd, newfd) \
+  __sanitizer_syscall_pre_impl_dup2((long)(oldfd), (long)(newfd))
+#define __sanitizer_syscall_post_dup2(res, oldfd, newfd) \
+  __sanitizer_syscall_post_impl_dup2(res, (long)(oldfd), (long)(newfd))
+#define __sanitizer_syscall_pre_dup3(oldfd, newfd, flags) \
+  __sanitizer_syscall_pre_impl_dup3((long)(oldfd), (long)(newfd), (long)(flags))
+#define __sanitizer_syscall_post_dup3(res, oldfd, newfd, flags)         \
+  __sanitizer_syscall_post_impl_dup3(res, (long)(oldfd), (long)(newfd), \
+                                     (long)(flags))
+#define __sanitizer_syscall_pre_ioperm(from, num, on) \
+  __sanitizer_syscall_pre_impl_ioperm((long)(from), (long)(num), (long)(on))
+#define __sanitizer_syscall_post_ioperm(res, from, num, on)            \
+  __sanitizer_syscall_post_impl_ioperm(res, (long)(from), (long)(num), \
+                                       (long)(on))
+#define __sanitizer_syscall_pre_ioctl(fd, cmd, arg) \
+  __sanitizer_syscall_pre_impl_ioctl((long)(fd), (long)(cmd), (long)(arg))
+#define __sanitizer_syscall_post_ioctl(res, fd, cmd, arg) \
+  __sanitizer_syscall_post_impl_ioctl(res, (long)(fd), (long)(cmd), (long)(arg))
+#define __sanitizer_syscall_pre_flock(fd, cmd) \
+  __sanitizer_syscall_pre_impl_flock((long)(fd), (long)(cmd))
+#define __sanitizer_syscall_post_flock(res, fd, cmd) \
+  __sanitizer_syscall_post_impl_flock(res, (long)(fd), (long)(cmd))
+#define __sanitizer_syscall_pre_io_setup(nr_reqs, ctx) \
+  __sanitizer_syscall_pre_impl_io_setup((long)(nr_reqs), (long)(ctx))
+#define __sanitizer_syscall_post_io_setup(res, nr_reqs, ctx) \
+  __sanitizer_syscall_post_impl_io_setup(res, (long)(nr_reqs), (long)(ctx))
+#define __sanitizer_syscall_pre_io_destroy(ctx) \
+  __sanitizer_syscall_pre_impl_io_destroy((long)(ctx))
+#define __sanitizer_syscall_post_io_destroy(res, ctx) \
+  __sanitizer_syscall_post_impl_io_destroy(res, (long)(ctx))
+#define __sanitizer_syscall_pre_io_getevents(ctx_id, min_nr, nr, events,    \
+                                             timeout)                       \
+  __sanitizer_syscall_pre_impl_io_getevents((long)(ctx_id), (long)(min_nr), \
+                                            (long)(nr), (long)(events),     \
+                                            (long)(timeout))
+#define __sanitizer_syscall_post_io_getevents(res, ctx_id, min_nr, nr, events, \
+                                              timeout)                         \
+  __sanitizer_syscall_post_impl_io_getevents(res, (long)(ctx_id),              \
+                                             (long)(min_nr), (long)(nr),       \
+                                             (long)(events), (long)(timeout))
+#define __sanitizer_syscall_pre_io_submit(ctx_id, arg1, arg2)          \
+  __sanitizer_syscall_pre_impl_io_submit((long)(ctx_id), (long)(arg1), \
+                                         (long)(arg2))
+#define __sanitizer_syscall_post_io_submit(res, ctx_id, arg1, arg2)          \
+  __sanitizer_syscall_post_impl_io_submit(res, (long)(ctx_id), (long)(arg1), \
+                                          (long)(arg2))
+#define __sanitizer_syscall_pre_io_cancel(ctx_id, iocb, result)        \
+  __sanitizer_syscall_pre_impl_io_cancel((long)(ctx_id), (long)(iocb), \
+                                         (long)(result))
+#define __sanitizer_syscall_post_io_cancel(res, ctx_id, iocb, result)        \
+  __sanitizer_syscall_post_impl_io_cancel(res, (long)(ctx_id), (long)(iocb), \
+                                          (long)(result))
+#define __sanitizer_syscall_pre_sendfile(out_fd, in_fd, offset, count) \
+  __sanitizer_syscall_pre_impl_sendfile((long)(out_fd), (long)(in_fd), \
+                                        (long)(offset), (long)(count))
+#define __sanitizer_syscall_post_sendfile(res, out_fd, in_fd, offset, count) \
+  __sanitizer_syscall_post_impl_sendfile(res, (long)(out_fd), (long)(in_fd), \
+                                         (long)(offset), (long)(count))
+#define __sanitizer_syscall_pre_sendfile64(out_fd, in_fd, offset, count) \
+  __sanitizer_syscall_pre_impl_sendfile64((long)(out_fd), (long)(in_fd), \
+                                          (long)(offset), (long)(count))
+#define __sanitizer_syscall_post_sendfile64(res, out_fd, in_fd, offset, count) \
+  __sanitizer_syscall_post_impl_sendfile64(res, (long)(out_fd), (long)(in_fd), \
+                                           (long)(offset), (long)(count))
+#define __sanitizer_syscall_pre_readlink(path, buf, bufsiz)        \
+  __sanitizer_syscall_pre_impl_readlink((long)(path), (long)(buf), \
+                                        (long)(bufsiz))
+#define __sanitizer_syscall_post_readlink(res, path, buf, bufsiz)        \
+  __sanitizer_syscall_post_impl_readlink(res, (long)(path), (long)(buf), \
+                                         (long)(bufsiz))
+#define __sanitizer_syscall_pre_creat(pathname, mode) \
+  __sanitizer_syscall_pre_impl_creat((long)(pathname), (long)(mode))
+#define __sanitizer_syscall_post_creat(res, pathname, mode) \
+  __sanitizer_syscall_post_impl_creat(res, (long)(pathname), (long)(mode))
+#define __sanitizer_syscall_pre_open(filename, flags, mode)          \
+  __sanitizer_syscall_pre_impl_open((long)(filename), (long)(flags), \
+                                    (long)(mode))
+#define __sanitizer_syscall_post_open(res, filename, flags, mode)          \
+  __sanitizer_syscall_post_impl_open(res, (long)(filename), (long)(flags), \
+                                     (long)(mode))
+#define __sanitizer_syscall_pre_close(fd) \
+  __sanitizer_syscall_pre_impl_close((long)(fd))
+#define __sanitizer_syscall_post_close(res, fd) \
+  __sanitizer_syscall_post_impl_close(res, (long)(fd))
+#define __sanitizer_syscall_pre_access(filename, mode) \
+  __sanitizer_syscall_pre_impl_access((long)(filename), (long)(mode))
+#define __sanitizer_syscall_post_access(res, filename, mode) \
+  __sanitizer_syscall_post_impl_access(res, (long)(filename), (long)(mode))
+#define __sanitizer_syscall_pre_vhangup() __sanitizer_syscall_pre_impl_vhangup()
+#define __sanitizer_syscall_post_vhangup(res) \
+  __sanitizer_syscall_post_impl_vhangup(res)
+#define __sanitizer_syscall_pre_chown(filename, user, group)         \
+  __sanitizer_syscall_pre_impl_chown((long)(filename), (long)(user), \
+                                     (long)(group))
+#define __sanitizer_syscall_post_chown(res, filename, user, group)         \
+  __sanitizer_syscall_post_impl_chown(res, (long)(filename), (long)(user), \
+                                      (long)(group))
+#define __sanitizer_syscall_pre_lchown(filename, user, group)         \
+  __sanitizer_syscall_pre_impl_lchown((long)(filename), (long)(user), \
+                                      (long)(group))
+#define __sanitizer_syscall_post_lchown(res, filename, user, group)         \
+  __sanitizer_syscall_post_impl_lchown(res, (long)(filename), (long)(user), \
+                                       (long)(group))
+#define __sanitizer_syscall_pre_fchown(fd, user, group) \
+  __sanitizer_syscall_pre_impl_fchown((long)(fd), (long)(user), (long)(group))
+#define __sanitizer_syscall_post_fchown(res, fd, user, group)         \
+  __sanitizer_syscall_post_impl_fchown(res, (long)(fd), (long)(user), \
+                                       (long)(group))
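+// The *16 variants below hook the legacy 16-bit uid/gid syscalls that are
+// still exposed on some 32-bit ABIs (e.g. i386).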
+#define __sanitizer_syscall_pre_chown16(filename, user, group)         \
+  __sanitizer_syscall_pre_impl_chown16((long)(filename), (long)(user), \
+                                       (long)(group))
+#define __sanitizer_syscall_post_chown16(res, filename, user, group)         \
+  __sanitizer_syscall_post_impl_chown16(res, (long)(filename), (long)(user), \
+                                        (long)(group))
+#define __sanitizer_syscall_pre_lchown16(filename, user, group)         \
+  __sanitizer_syscall_pre_impl_lchown16((long)(filename), (long)(user), \
+                                        (long)(group))
+#define __sanitizer_syscall_post_lchown16(res, filename, user, group)         \
+  __sanitizer_syscall_post_impl_lchown16(res, (long)(filename), (long)(user), \
+                                         (long)(group))
+#define __sanitizer_syscall_pre_fchown16(fd, user, group)         \
+  __sanitizer_syscall_pre_impl_fchown16((long)(fd), (long)(user), \
+                                        (long)(group))
+#define __sanitizer_syscall_post_fchown16(res, fd, user, group)         \
+  __sanitizer_syscall_post_impl_fchown16(res, (long)(fd), (long)(user), \
+                                         (long)(group))
+#define __sanitizer_syscall_pre_setregid16(rgid, egid) \
+  __sanitizer_syscall_pre_impl_setregid16((long)(rgid), (long)(egid))
+#define __sanitizer_syscall_post_setregid16(res, rgid, egid) \
+  __sanitizer_syscall_post_impl_setregid16(res, (long)(rgid), (long)(egid))
+#define __sanitizer_syscall_pre_setgid16(gid) \
+  __sanitizer_syscall_pre_impl_setgid16((long)(gid))
+#define __sanitizer_syscall_post_setgid16(res, gid) \
+  __sanitizer_syscall_post_impl_setgid16(res, (long)(gid))
+#define __sanitizer_syscall_pre_setreuid16(ruid, euid) \
+  __sanitizer_syscall_pre_impl_setreuid16((long)(ruid), (long)(euid))
+#define __sanitizer_syscall_post_setreuid16(res, ruid, euid) \
+  __sanitizer_syscall_post_impl_setreuid16(res, (long)(ruid), (long)(euid))
+#define __sanitizer_syscall_pre_setuid16(uid) \
+  __sanitizer_syscall_pre_impl_setuid16((long)(uid))
+#define __sanitizer_syscall_post_setuid16(res, uid) \
+  __sanitizer_syscall_post_impl_setuid16(res, (long)(uid))
+#define __sanitizer_syscall_pre_setresuid16(ruid, euid, suid)          \
+  __sanitizer_syscall_pre_impl_setresuid16((long)(ruid), (long)(euid), \
+                                           (long)(suid))
+#define __sanitizer_syscall_post_setresuid16(res, ruid, euid, suid)          \
+  __sanitizer_syscall_post_impl_setresuid16(res, (long)(ruid), (long)(euid), \
+                                            (long)(suid))
+#define __sanitizer_syscall_pre_getresuid16(ruid, euid, suid)          \
+  __sanitizer_syscall_pre_impl_getresuid16((long)(ruid), (long)(euid), \
+                                           (long)(suid))
+#define __sanitizer_syscall_post_getresuid16(res, ruid, euid, suid)          \
+  __sanitizer_syscall_post_impl_getresuid16(res, (long)(ruid), (long)(euid), \
+                                            (long)(suid))
+#define __sanitizer_syscall_pre_setresgid16(rgid, egid, sgid)          \
+  __sanitizer_syscall_pre_impl_setresgid16((long)(rgid), (long)(egid), \
+                                           (long)(sgid))
+#define __sanitizer_syscall_post_setresgid16(res, rgid, egid, sgid)          \
+  __sanitizer_syscall_post_impl_setresgid16(res, (long)(rgid), (long)(egid), \
+                                            (long)(sgid))
+#define __sanitizer_syscall_pre_getresgid16(rgid, egid, sgid)          \
+  __sanitizer_syscall_pre_impl_getresgid16((long)(rgid), (long)(egid), \
+                                           (long)(sgid))
+#define __sanitizer_syscall_post_getresgid16(res, rgid, egid, sgid)          \
+  __sanitizer_syscall_post_impl_getresgid16(res, (long)(rgid), (long)(egid), \
+                                            (long)(sgid))
+#define __sanitizer_syscall_pre_setfsuid16(uid) \
+  __sanitizer_syscall_pre_impl_setfsuid16((long)(uid))
+#define __sanitizer_syscall_post_setfsuid16(res, uid) \
+  __sanitizer_syscall_post_impl_setfsuid16(res, (long)(uid))
+#define __sanitizer_syscall_pre_setfsgid16(gid) \
+  __sanitizer_syscall_pre_impl_setfsgid16((long)(gid))
+#define __sanitizer_syscall_post_setfsgid16(res, gid) \
+  __sanitizer_syscall_post_impl_setfsgid16(res, (long)(gid))
+#define __sanitizer_syscall_pre_getgroups16(gidsetsize, grouplist) \
+  __sanitizer_syscall_pre_impl_getgroups16((long)(gidsetsize),     \
+                                           (long)(grouplist))
+#define __sanitizer_syscall_post_getgroups16(res, gidsetsize, grouplist) \
+  __sanitizer_syscall_post_impl_getgroups16(res, (long)(gidsetsize),     \
+                                            (long)(grouplist))
+#define __sanitizer_syscall_pre_setgroups16(gidsetsize, grouplist) \
+  __sanitizer_syscall_pre_impl_setgroups16((long)(gidsetsize),     \
+                                           (long)(grouplist))
+#define __sanitizer_syscall_post_setgroups16(res, gidsetsize, grouplist) \
+  __sanitizer_syscall_post_impl_setgroups16(res, (long)(gidsetsize),     \
+                                            (long)(grouplist))
+#define __sanitizer_syscall_pre_getuid16() \
+  __sanitizer_syscall_pre_impl_getuid16()
+#define __sanitizer_syscall_post_getuid16(res) \
+  __sanitizer_syscall_post_impl_getuid16(res)
+#define __sanitizer_syscall_pre_geteuid16() \
+  __sanitizer_syscall_pre_impl_geteuid16()
+#define __sanitizer_syscall_post_geteuid16(res) \
+  __sanitizer_syscall_post_impl_geteuid16(res)
+#define __sanitizer_syscall_pre_getgid16() \
+  __sanitizer_syscall_pre_impl_getgid16()
+#define __sanitizer_syscall_post_getgid16(res) \
+  __sanitizer_syscall_post_impl_getgid16(res)
+#define __sanitizer_syscall_pre_getegid16() \
+  __sanitizer_syscall_pre_impl_getegid16()
+#define __sanitizer_syscall_post_getegid16(res) \
+  __sanitizer_syscall_post_impl_getegid16(res)
+#define __sanitizer_syscall_pre_utime(filename, times) \
+  __sanitizer_syscall_pre_impl_utime((long)(filename), (long)(times))
+#define __sanitizer_syscall_post_utime(res, filename, times) \
+  __sanitizer_syscall_post_impl_utime(res, (long)(filename), (long)(times))
+#define __sanitizer_syscall_pre_utimes(filename, utimes) \
+  __sanitizer_syscall_pre_impl_utimes((long)(filename), (long)(utimes))
+#define __sanitizer_syscall_post_utimes(res, filename, utimes) \
+  __sanitizer_syscall_post_impl_utimes(res, (long)(filename), (long)(utimes))
+#define __sanitizer_syscall_pre_lseek(fd, offset, origin) \
+  __sanitizer_syscall_pre_impl_lseek((long)(fd), (long)(offset), (long)(origin))
+#define __sanitizer_syscall_post_lseek(res, fd, offset, origin)        \
+  __sanitizer_syscall_post_impl_lseek(res, (long)(fd), (long)(offset), \
+                                      (long)(origin))
+#define __sanitizer_syscall_pre_llseek(fd, offset_high, offset_low, result, \
+                                       origin)                              \
+  __sanitizer_syscall_pre_impl_llseek((long)(fd), (long)(offset_high),      \
+                                      (long)(offset_low), (long)(result),   \
+                                      (long)(origin))
+#define __sanitizer_syscall_post_llseek(res, fd, offset_high, offset_low,    \
+                                        result, origin)                      \
+  __sanitizer_syscall_post_impl_llseek(res, (long)(fd), (long)(offset_high), \
+                                       (long)(offset_low), (long)(result),   \
+                                       (long)(origin))
+#define __sanitizer_syscall_pre_read(fd, buf, count) \
+  __sanitizer_syscall_pre_impl_read((long)(fd), (long)(buf), (long)(count))
+#define __sanitizer_syscall_post_read(res, fd, buf, count)         \
+  __sanitizer_syscall_post_impl_read(res, (long)(fd), (long)(buf), \
+                                     (long)(count))
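+// Typical use of these hooks around a raw syscall, e.g. for read:
+//   __sanitizer_syscall_pre_read(fd, buf, count);
+//   long res = syscall(__NR_read, fd, buf, count);
+//   __sanitizer_syscall_post_read(res, fd, buf, count);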
+#define __sanitizer_syscall_pre_readv(fd, vec, vlen) \
+  __sanitizer_syscall_pre_impl_readv((long)(fd), (long)(vec), (long)(vlen))
+#define __sanitizer_syscall_post_readv(res, fd, vec, vlen)          \
+  __sanitizer_syscall_post_impl_readv(res, (long)(fd), (long)(vec), \
+                                      (long)(vlen))
+#define __sanitizer_syscall_pre_write(fd, buf, count) \
+  __sanitizer_syscall_pre_impl_write((long)(fd), (long)(buf), (long)(count))
+#define __sanitizer_syscall_post_write(res, fd, buf, count)         \
+  __sanitizer_syscall_post_impl_write(res, (long)(fd), (long)(buf), \
+                                      (long)(count))
+#define __sanitizer_syscall_pre_writev(fd, vec, vlen) \
+  __sanitizer_syscall_pre_impl_writev((long)(fd), (long)(vec), (long)(vlen))
+#define __sanitizer_syscall_post_writev(res, fd, vec, vlen)          \
+  __sanitizer_syscall_post_impl_writev(res, (long)(fd), (long)(vec), \
+                                       (long)(vlen))
+
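+// pread64/pwrite64 take the 64-bit file offset as a single argument on LP64
+// targets, but split across two arguments on 32-bit targets, so the hook
+// arity differs below.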
+#ifdef _LP64
+#define __sanitizer_syscall_pre_pread64(fd, buf, count, pos)                   \
+  __sanitizer_syscall_pre_impl_pread64((long)(fd), (long)(buf), (long)(count), \
+                                       (long)(pos))
+#define __sanitizer_syscall_post_pread64(res, fd, buf, count, pos)    \
+  __sanitizer_syscall_post_impl_pread64(res, (long)(fd), (long)(buf), \
+                                        (long)(count), (long)(pos))
+#define __sanitizer_syscall_pre_pwrite64(fd, buf, count, pos)    \
+  __sanitizer_syscall_pre_impl_pwrite64((long)(fd), (long)(buf), \
+                                        (long)(count), (long)(pos))
+#define __sanitizer_syscall_post_pwrite64(res, fd, buf, count, pos)    \
+  __sanitizer_syscall_post_impl_pwrite64(res, (long)(fd), (long)(buf), \
+                                         (long)(count), (long)(pos))
+#else
+#define __sanitizer_syscall_pre_pread64(fd, buf, count, pos0, pos1)            \
+  __sanitizer_syscall_pre_impl_pread64((long)(fd), (long)(buf), (long)(count), \
+                                       (long)(pos0), (long)(pos1))
+#define __sanitizer_syscall_post_pread64(res, fd, buf, count, pos0, pos1) \
+  __sanitizer_syscall_post_impl_pread64(res, (long)(fd), (long)(buf),     \
+                                        (long)(count), (long)(pos0),      \
+                                        (long)(pos1))
+#define __sanitizer_syscall_pre_pwrite64(fd, buf, count, pos0, pos1) \
+  __sanitizer_syscall_pre_impl_pwrite64(                             \
+      (long)(fd), (long)(buf), (long)(count), (long)(pos0), (long)(pos1))
+#define __sanitizer_syscall_post_pwrite64(res, fd, buf, count, pos0, pos1) \
+  __sanitizer_syscall_post_impl_pwrite64(                                  \
+      res, (long)(fd), (long)(buf), (long)(count), (long)(pos0), (long)(pos1))
+#endif
+
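+// Unlike pread64/pwrite64 above, preadv/pwritev always pass the offset as
+// two halves (pos_l/pos_h), regardless of the target's word size.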
+#define __sanitizer_syscall_pre_preadv(fd, vec, vlen, pos_l, pos_h)          \
+  __sanitizer_syscall_pre_impl_preadv((long)(fd), (long)(vec), (long)(vlen), \
+                                      (long)(pos_l), (long)(pos_h))
+#define __sanitizer_syscall_post_preadv(res, fd, vec, vlen, pos_l, pos_h) \
+  __sanitizer_syscall_post_impl_preadv(res, (long)(fd), (long)(vec),      \
+                                       (long)(vlen), (long)(pos_l),       \
+                                       (long)(pos_h))
+#define __sanitizer_syscall_pre_pwritev(fd, vec, vlen, pos_l, pos_h)          \
+  __sanitizer_syscall_pre_impl_pwritev((long)(fd), (long)(vec), (long)(vlen), \
+                                       (long)(pos_l), (long)(pos_h))
+#define __sanitizer_syscall_post_pwritev(res, fd, vec, vlen, pos_l, pos_h) \
+  __sanitizer_syscall_post_impl_pwritev(res, (long)(fd), (long)(vec),      \
+                                        (long)(vlen), (long)(pos_l),       \
+                                        (long)(pos_h))
+#define __sanitizer_syscall_pre_getcwd(buf, size) \
+  __sanitizer_syscall_pre_impl_getcwd((long)(buf), (long)(size))
+#define __sanitizer_syscall_post_getcwd(res, buf, size) \
+  __sanitizer_syscall_post_impl_getcwd(res, (long)(buf), (long)(size))
+#define __sanitizer_syscall_pre_mkdir(pathname, mode) \
+  __sanitizer_syscall_pre_impl_mkdir((long)(pathname), (long)(mode))
+#define __sanitizer_syscall_post_mkdir(res, pathname, mode) \
+  __sanitizer_syscall_post_impl_mkdir(res, (long)(pathname), (long)(mode))
+#define __sanitizer_syscall_pre_chdir(filename) \
+  __sanitizer_syscall_pre_impl_chdir((long)(filename))
+#define __sanitizer_syscall_post_chdir(res, filename) \
+  __sanitizer_syscall_post_impl_chdir(res, (long)(filename))
+#define __sanitizer_syscall_pre_fchdir(fd) \
+  __sanitizer_syscall_pre_impl_fchdir((long)(fd))
+#define __sanitizer_syscall_post_fchdir(res, fd) \
+  __sanitizer_syscall_post_impl_fchdir(res, (long)(fd))
+#define __sanitizer_syscall_pre_rmdir(pathname) \
+  __sanitizer_syscall_pre_impl_rmdir((long)(pathname))
+#define __sanitizer_syscall_post_rmdir(res, pathname) \
+  __sanitizer_syscall_post_impl_rmdir(res, (long)(pathname))
+#define __sanitizer_syscall_pre_lookup_dcookie(cookie64, buf, len)           \
+  __sanitizer_syscall_pre_impl_lookup_dcookie((long)(cookie64), (long)(buf), \
+                                              (long)(len))
+#define __sanitizer_syscall_post_lookup_dcookie(res, cookie64, buf, len) \
+  __sanitizer_syscall_post_impl_lookup_dcookie(res, (long)(cookie64),    \
+                                               (long)(buf), (long)(len))
+#define __sanitizer_syscall_pre_quotactl(cmd, special, id, addr)      \
+  __sanitizer_syscall_pre_impl_quotactl((long)(cmd), (long)(special), \
+                                        (long)(id), (long)(addr))
+#define __sanitizer_syscall_post_quotactl(res, cmd, special, id, addr)      \
+  __sanitizer_syscall_post_impl_quotactl(res, (long)(cmd), (long)(special), \
+                                         (long)(id), (long)(addr))
+#define __sanitizer_syscall_pre_getdents(fd, dirent, count)         \
+  __sanitizer_syscall_pre_impl_getdents((long)(fd), (long)(dirent), \
+                                        (long)(count))
+#define __sanitizer_syscall_post_getdents(res, fd, dirent, count)         \
+  __sanitizer_syscall_post_impl_getdents(res, (long)(fd), (long)(dirent), \
+                                         (long)(count))
+#define __sanitizer_syscall_pre_getdents64(fd, dirent, count)         \
+  __sanitizer_syscall_pre_impl_getdents64((long)(fd), (long)(dirent), \
+                                          (long)(count))
+#define __sanitizer_syscall_post_getdents64(res, fd, dirent, count)         \
+  __sanitizer_syscall_post_impl_getdents64(res, (long)(fd), (long)(dirent), \
+                                           (long)(count))
+#define __sanitizer_syscall_pre_setsockopt(fd, level, optname, optval, optlen) \
+  __sanitizer_syscall_pre_impl_setsockopt((long)(fd), (long)(level),           \
+                                          (long)(optname), (long)(optval),     \
+                                          (long)(optlen))
+#define __sanitizer_syscall_post_setsockopt(res, fd, level, optname, optval, \
+                                            optlen)                          \
+  __sanitizer_syscall_post_impl_setsockopt(res, (long)(fd), (long)(level),   \
+                                           (long)(optname), (long)(optval),  \
+                                           (long)(optlen))
+#define __sanitizer_syscall_pre_getsockopt(fd, level, optname, optval, optlen) \
+  __sanitizer_syscall_pre_impl_getsockopt((long)(fd), (long)(level),           \
+                                          (long)(optname), (long)(optval),     \
+                                          (long)(optlen))
+#define __sanitizer_syscall_post_getsockopt(res, fd, level, optname, optval, \
+                                            optlen)                          \
+  __sanitizer_syscall_post_impl_getsockopt(res, (long)(fd), (long)(level),   \
+                                           (long)(optname), (long)(optval),  \
+                                           (long)(optlen))
+#define __sanitizer_syscall_pre_bind(arg0, arg1, arg2) \
+  __sanitizer_syscall_pre_impl_bind((long)(arg0), (long)(arg1), (long)(arg2))
+#define __sanitizer_syscall_post_bind(res, arg0, arg1, arg2)          \
+  __sanitizer_syscall_post_impl_bind(res, (long)(arg0), (long)(arg1), \
+                                     (long)(arg2))
+#define __sanitizer_syscall_pre_connect(arg0, arg1, arg2) \
+  __sanitizer_syscall_pre_impl_connect((long)(arg0), (long)(arg1), (long)(arg2))
+#define __sanitizer_syscall_post_connect(res, arg0, arg1, arg2)          \
+  __sanitizer_syscall_post_impl_connect(res, (long)(arg0), (long)(arg1), \
+                                        (long)(arg2))
+#define __sanitizer_syscall_pre_accept(arg0, arg1, arg2) \
+  __sanitizer_syscall_pre_impl_accept((long)(arg0), (long)(arg1), (long)(arg2))
+#define __sanitizer_syscall_post_accept(res, arg0, arg1, arg2)          \
+  __sanitizer_syscall_post_impl_accept(res, (long)(arg0), (long)(arg1), \
+                                       (long)(arg2))
+#define __sanitizer_syscall_pre_accept4(arg0, arg1, arg2, arg3)    \
+  __sanitizer_syscall_pre_impl_accept4((long)(arg0), (long)(arg1), \
+                                       (long)(arg2), (long)(arg3))
+#define __sanitizer_syscall_post_accept4(res, arg0, arg1, arg2, arg3)    \
+  __sanitizer_syscall_post_impl_accept4(res, (long)(arg0), (long)(arg1), \
+                                        (long)(arg2), (long)(arg3))
+#define __sanitizer_syscall_pre_getsockname(arg0, arg1, arg2)          \
+  __sanitizer_syscall_pre_impl_getsockname((long)(arg0), (long)(arg1), \
+                                           (long)(arg2))
+#define __sanitizer_syscall_post_getsockname(res, arg0, arg1, arg2)          \
+  __sanitizer_syscall_post_impl_getsockname(res, (long)(arg0), (long)(arg1), \
+                                            (long)(arg2))
+#define __sanitizer_syscall_pre_getpeername(arg0, arg1, arg2)          \
+  __sanitizer_syscall_pre_impl_getpeername((long)(arg0), (long)(arg1), \
+                                           (long)(arg2))
+#define __sanitizer_syscall_post_getpeername(res, arg0, arg1, arg2)          \
+  __sanitizer_syscall_post_impl_getpeername(res, (long)(arg0), (long)(arg1), \
+                                            (long)(arg2))
+#define __sanitizer_syscall_pre_send(arg0, arg1, arg2, arg3)                  \
+  __sanitizer_syscall_pre_impl_send((long)(arg0), (long)(arg1), (long)(arg2), \
+                                    (long)(arg3))
+#define __sanitizer_syscall_post_send(res, arg0, arg1, arg2, arg3)    \
+  __sanitizer_syscall_post_impl_send(res, (long)(arg0), (long)(arg1), \
+                                     (long)(arg2), (long)(arg3))
+#define __sanitizer_syscall_pre_sendto(arg0, arg1, arg2, arg3, arg4, arg5) \
+  __sanitizer_syscall_pre_impl_sendto((long)(arg0), (long)(arg1),          \
+                                      (long)(arg2), (long)(arg3),          \
+                                      (long)(arg4), (long)(arg5))
+#define __sanitizer_syscall_post_sendto(res, arg0, arg1, arg2, arg3, arg4, \
+                                        arg5)                              \
+  __sanitizer_syscall_post_impl_sendto(res, (long)(arg0), (long)(arg1),    \
+                                       (long)(arg2), (long)(arg3),         \
+                                       (long)(arg4), (long)(arg5))
+#define __sanitizer_syscall_pre_sendmsg(fd, msg, flags) \
+  __sanitizer_syscall_pre_impl_sendmsg((long)(fd), (long)(msg), (long)(flags))
+#define __sanitizer_syscall_post_sendmsg(res, fd, msg, flags)         \
+  __sanitizer_syscall_post_impl_sendmsg(res, (long)(fd), (long)(msg), \
+                                        (long)(flags))
+#define __sanitizer_syscall_pre_sendmmsg(fd, msg, vlen, flags)                 \
+  __sanitizer_syscall_pre_impl_sendmmsg((long)(fd), (long)(msg), (long)(vlen), \
+                                        (long)(flags))
+#define __sanitizer_syscall_post_sendmmsg(res, fd, msg, vlen, flags)   \
+  __sanitizer_syscall_post_impl_sendmmsg(res, (long)(fd), (long)(msg), \
+                                         (long)(vlen), (long)(flags))
+#define __sanitizer_syscall_pre_recv(arg0, arg1, arg2, arg3)                  \
+  __sanitizer_syscall_pre_impl_recv((long)(arg0), (long)(arg1), (long)(arg2), \
+                                    (long)(arg3))
+#define __sanitizer_syscall_post_recv(res, arg0, arg1, arg2, arg3)    \
+  __sanitizer_syscall_post_impl_recv(res, (long)(arg0), (long)(arg1), \
+                                     (long)(arg2), (long)(arg3))
+#define __sanitizer_syscall_pre_recvfrom(arg0, arg1, arg2, arg3, arg4, arg5) \
+  __sanitizer_syscall_pre_impl_recvfrom((long)(arg0), (long)(arg1),          \
+                                        (long)(arg2), (long)(arg3),          \
+                                        (long)(arg4), (long)(arg5))
+#define __sanitizer_syscall_post_recvfrom(res, arg0, arg1, arg2, arg3, arg4, \
+                                          arg5)                              \
+  __sanitizer_syscall_post_impl_recvfrom(res, (long)(arg0), (long)(arg1),    \
+                                         (long)(arg2), (long)(arg3),         \
+                                         (long)(arg4), (long)(arg5))
+#define __sanitizer_syscall_pre_recvmsg(fd, msg, flags) \
+  __sanitizer_syscall_pre_impl_recvmsg((long)(fd), (long)(msg), (long)(flags))
+#define __sanitizer_syscall_post_recvmsg(res, fd, msg, flags)         \
+  __sanitizer_syscall_post_impl_recvmsg(res, (long)(fd), (long)(msg), \
+                                        (long)(flags))
+#define __sanitizer_syscall_pre_recvmmsg(fd, msg, vlen, flags, timeout)        \
+  __sanitizer_syscall_pre_impl_recvmmsg((long)(fd), (long)(msg), (long)(vlen), \
+                                        (long)(flags), (long)(timeout))
+#define __sanitizer_syscall_post_recvmmsg(res, fd, msg, vlen, flags, timeout) \
+  __sanitizer_syscall_post_impl_recvmmsg(res, (long)(fd), (long)(msg),        \
+                                         (long)(vlen), (long)(flags),         \
+                                         (long)(timeout))
+#define __sanitizer_syscall_pre_socket(arg0, arg1, arg2) \
+  __sanitizer_syscall_pre_impl_socket((long)(arg0), (long)(arg1), (long)(arg2))
+#define __sanitizer_syscall_post_socket(res, arg0, arg1, arg2)          \
+  __sanitizer_syscall_post_impl_socket(res, (long)(arg0), (long)(arg1), \
+                                       (long)(arg2))
+#define __sanitizer_syscall_pre_socketpair(arg0, arg1, arg2, arg3)    \
+  __sanitizer_syscall_pre_impl_socketpair((long)(arg0), (long)(arg1), \
+                                          (long)(arg2), (long)(arg3))
+#define __sanitizer_syscall_post_socketpair(res, arg0, arg1, arg2, arg3)    \
+  __sanitizer_syscall_post_impl_socketpair(res, (long)(arg0), (long)(arg1), \
+                                           (long)(arg2), (long)(arg3))
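+// socketcall() multiplexes the socket operations on architectures that lack
+// separate socket syscalls; `call` selects the operation and `args` points
+// to its packed argument array.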
+#define __sanitizer_syscall_pre_socketcall(call, args) \
+  __sanitizer_syscall_pre_impl_socketcall((long)(call), (long)(args))
+#define __sanitizer_syscall_post_socketcall(res, call, args) \
+  __sanitizer_syscall_post_impl_socketcall(res, (long)(call), (long)(args))
+#define __sanitizer_syscall_pre_listen(arg0, arg1) \
+  __sanitizer_syscall_pre_impl_listen((long)(arg0), (long)(arg1))
+#define __sanitizer_syscall_post_listen(res, arg0, arg1) \
+  __sanitizer_syscall_post_impl_listen(res, (long)(arg0), (long)(arg1))
+#define __sanitizer_syscall_pre_poll(ufds, nfds, timeout) \
+  __sanitizer_syscall_pre_impl_poll((long)(ufds), (long)(nfds), (long)(timeout))
+#define __sanitizer_syscall_post_poll(res, ufds, nfds, timeout)       \
+  __sanitizer_syscall_post_impl_poll(res, (long)(ufds), (long)(nfds), \
+                                     (long)(timeout))
+#define __sanitizer_syscall_pre_select(n, inp, outp, exp, tvp)              \
+  __sanitizer_syscall_pre_impl_select((long)(n), (long)(inp), (long)(outp), \
+                                      (long)(exp), (long)(tvp))
+#define __sanitizer_syscall_post_select(res, n, inp, outp, exp, tvp) \
+  __sanitizer_syscall_post_impl_select(res, (long)(n), (long)(inp),  \
+                                       (long)(outp), (long)(exp), (long)(tvp))
+#define __sanitizer_syscall_pre_old_select(arg) \
+  __sanitizer_syscall_pre_impl_old_select((long)(arg))
+#define __sanitizer_syscall_post_old_select(res, arg) \
+  __sanitizer_syscall_post_impl_old_select(res, (long)(arg))
+#define __sanitizer_syscall_pre_epoll_create(size) \
+  __sanitizer_syscall_pre_impl_epoll_create((long)(size))
+#define __sanitizer_syscall_post_epoll_create(res, size) \
+  __sanitizer_syscall_post_impl_epoll_create(res, (long)(size))
+#define __sanitizer_syscall_pre_epoll_create1(flags) \
+  __sanitizer_syscall_pre_impl_epoll_create1((long)(flags))
+#define __sanitizer_syscall_post_epoll_create1(res, flags) \
+  __sanitizer_syscall_post_impl_epoll_create1(res, (long)(flags))
+#define __sanitizer_syscall_pre_epoll_ctl(epfd, op, fd, event)                 \
+  __sanitizer_syscall_pre_impl_epoll_ctl((long)(epfd), (long)(op), (long)(fd), \
+                                         (long)(event))
+#define __sanitizer_syscall_post_epoll_ctl(res, epfd, op, fd, event)     \
+  __sanitizer_syscall_post_impl_epoll_ctl(res, (long)(epfd), (long)(op), \
+                                          (long)(fd), (long)(event))
+#define __sanitizer_syscall_pre_epoll_wait(epfd, events, maxevents, timeout) \
+  __sanitizer_syscall_pre_impl_epoll_wait((long)(epfd), (long)(events),      \
+                                          (long)(maxevents), (long)(timeout))
+#define __sanitizer_syscall_post_epoll_wait(res, epfd, events, maxevents,     \
+                                            timeout)                          \
+  __sanitizer_syscall_post_impl_epoll_wait(res, (long)(epfd), (long)(events), \
+                                           (long)(maxevents), (long)(timeout))
+#define __sanitizer_syscall_pre_epoll_pwait(epfd, events, maxevents, timeout, \
+                                            sigmask, sigsetsize)              \
+  __sanitizer_syscall_pre_impl_epoll_pwait(                                   \
+      (long)(epfd), (long)(events), (long)(maxevents), (long)(timeout),       \
+      (long)(sigmask), (long)(sigsetsize))
+#define __sanitizer_syscall_post_epoll_pwait(res, epfd, events, maxevents,   \
+                                             timeout, sigmask, sigsetsize)   \
+  __sanitizer_syscall_post_impl_epoll_pwait(                                 \
+      res, (long)(epfd), (long)(events), (long)(maxevents), (long)(timeout), \
+      (long)(sigmask), (long)(sigsetsize))
+#define __sanitizer_syscall_pre_gethostname(name, len) \
+  __sanitizer_syscall_pre_impl_gethostname((long)(name), (long)(len))
+#define __sanitizer_syscall_post_gethostname(res, name, len) \
+  __sanitizer_syscall_post_impl_gethostname(res, (long)(name), (long)(len))
+#define __sanitizer_syscall_pre_sethostname(name, len) \
+  __sanitizer_syscall_pre_impl_sethostname((long)(name), (long)(len))
+#define __sanitizer_syscall_post_sethostname(res, name, len) \
+  __sanitizer_syscall_post_impl_sethostname(res, (long)(name), (long)(len))
+#define __sanitizer_syscall_pre_setdomainname(name, len) \
+  __sanitizer_syscall_pre_impl_setdomainname((long)(name), (long)(len))
+#define __sanitizer_syscall_post_setdomainname(res, name, len) \
+  __sanitizer_syscall_post_impl_setdomainname(res, (long)(name), (long)(len))
+#define __sanitizer_syscall_pre_newuname(name) \
+  __sanitizer_syscall_pre_impl_newuname((long)(name))
+#define __sanitizer_syscall_post_newuname(res, name) \
+  __sanitizer_syscall_post_impl_newuname(res, (long)(name))
+#define __sanitizer_syscall_pre_uname(arg0) \
+  __sanitizer_syscall_pre_impl_uname((long)(arg0))
+#define __sanitizer_syscall_post_uname(res, arg0) \
+  __sanitizer_syscall_post_impl_uname(res, (long)(arg0))
+#define __sanitizer_syscall_pre_olduname(arg0) \
+  __sanitizer_syscall_pre_impl_olduname((long)(arg0))
+#define __sanitizer_syscall_post_olduname(res, arg0) \
+  __sanitizer_syscall_post_impl_olduname(res, (long)(arg0))
+#define __sanitizer_syscall_pre_getrlimit(resource, rlim) \
+  __sanitizer_syscall_pre_impl_getrlimit((long)(resource), (long)(rlim))
+#define __sanitizer_syscall_post_getrlimit(res, resource, rlim) \
+  __sanitizer_syscall_post_impl_getrlimit(res, (long)(resource), (long)(rlim))
+#define __sanitizer_syscall_pre_old_getrlimit(resource, rlim) \
+  __sanitizer_syscall_pre_impl_old_getrlimit((long)(resource), (long)(rlim))
+#define __sanitizer_syscall_post_old_getrlimit(res, resource, rlim)  \
+  __sanitizer_syscall_post_impl_old_getrlimit(res, (long)(resource), \
+                                              (long)(rlim))
+#define __sanitizer_syscall_pre_setrlimit(resource, rlim) \
+  __sanitizer_syscall_pre_impl_setrlimit((long)(resource), (long)(rlim))
+#define __sanitizer_syscall_post_setrlimit(res, resource, rlim) \
+  __sanitizer_syscall_post_impl_setrlimit(res, (long)(resource), (long)(rlim))
+#define __sanitizer_syscall_pre_prlimit64(pid, resource, new_rlim, old_rlim) \
+  __sanitizer_syscall_pre_impl_prlimit64((long)(pid), (long)(resource),      \
+                                         (long)(new_rlim), (long)(old_rlim))
+#define __sanitizer_syscall_post_prlimit64(res, pid, resource, new_rlim,      \
+                                           old_rlim)                          \
+  __sanitizer_syscall_post_impl_prlimit64(res, (long)(pid), (long)(resource), \
+                                          (long)(new_rlim), (long)(old_rlim))
+#define __sanitizer_syscall_pre_getrusage(who, ru) \
+  __sanitizer_syscall_pre_impl_getrusage((long)(who), (long)(ru))
+#define __sanitizer_syscall_post_getrusage(res, who, ru) \
+  __sanitizer_syscall_post_impl_getrusage(res, (long)(who), (long)(ru))
+#define __sanitizer_syscall_pre_umask(mask) \
+  __sanitizer_syscall_pre_impl_umask((long)(mask))
+#define __sanitizer_syscall_post_umask(res, mask) \
+  __sanitizer_syscall_post_impl_umask(res, (long)(mask))
+#define __sanitizer_syscall_pre_msgget(key, msgflg) \
+  __sanitizer_syscall_pre_impl_msgget((long)(key), (long)(msgflg))
+#define __sanitizer_syscall_post_msgget(res, key, msgflg) \
+  __sanitizer_syscall_post_impl_msgget(res, (long)(key), (long)(msgflg))
+#define __sanitizer_syscall_pre_msgsnd(msqid, msgp, msgsz, msgflg) \
+  __sanitizer_syscall_pre_impl_msgsnd((long)(msqid), (long)(msgp), \
+                                      (long)(msgsz), (long)(msgflg))
+#define __sanitizer_syscall_post_msgsnd(res, msqid, msgp, msgsz, msgflg) \
+  __sanitizer_syscall_post_impl_msgsnd(res, (long)(msqid), (long)(msgp), \
+                                       (long)(msgsz), (long)(msgflg))
+#define __sanitizer_syscall_pre_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg) \
+  __sanitizer_syscall_pre_impl_msgrcv((long)(msqid), (long)(msgp),         \
+                                      (long)(msgsz), (long)(msgtyp),       \
+                                      (long)(msgflg))
+#define __sanitizer_syscall_post_msgrcv(res, msqid, msgp, msgsz, msgtyp, \
+                                        msgflg)                          \
+  __sanitizer_syscall_post_impl_msgrcv(res, (long)(msqid), (long)(msgp), \
+                                       (long)(msgsz), (long)(msgtyp),    \
+                                       (long)(msgflg))
+#define __sanitizer_syscall_pre_msgctl(msqid, cmd, buf) \
+  __sanitizer_syscall_pre_impl_msgctl((long)(msqid), (long)(cmd), (long)(buf))
+#define __sanitizer_syscall_post_msgctl(res, msqid, cmd, buf)           \
+  __sanitizer_syscall_post_impl_msgctl(res, (long)(msqid), (long)(cmd), \
+                                       (long)(buf))
+#define __sanitizer_syscall_pre_semget(key, nsems, semflg)        \
+  __sanitizer_syscall_pre_impl_semget((long)(key), (long)(nsems), \
+                                      (long)(semflg))
+#define __sanitizer_syscall_post_semget(res, key, nsems, semflg)        \
+  __sanitizer_syscall_post_impl_semget(res, (long)(key), (long)(nsems), \
+                                       (long)(semflg))
+#define __sanitizer_syscall_pre_semop(semid, sops, nsops) \
+  __sanitizer_syscall_pre_impl_semop((long)(semid), (long)(sops), (long)(nsops))
+#define __sanitizer_syscall_post_semop(res, semid, sops, nsops)         \
+  __sanitizer_syscall_post_impl_semop(res, (long)(semid), (long)(sops), \
+                                      (long)(nsops))
+#define __sanitizer_syscall_pre_semctl(semid, semnum, cmd, arg)      \
+  __sanitizer_syscall_pre_impl_semctl((long)(semid), (long)(semnum), \
+                                      (long)(cmd), (long)(arg))
+#define __sanitizer_syscall_post_semctl(res, semid, semnum, cmd, arg)      \
+  __sanitizer_syscall_post_impl_semctl(res, (long)(semid), (long)(semnum), \
+                                       (long)(cmd), (long)(arg))
+#define __sanitizer_syscall_pre_semtimedop(semid, sops, nsops, timeout) \
+  __sanitizer_syscall_pre_impl_semtimedop((long)(semid), (long)(sops),  \
+                                          (long)(nsops), (long)(timeout))
+#define __sanitizer_syscall_post_semtimedop(res, semid, sops, nsops, timeout) \
+  __sanitizer_syscall_post_impl_semtimedop(res, (long)(semid), (long)(sops),  \
+                                           (long)(nsops), (long)(timeout))
+#define __sanitizer_syscall_pre_shmat(shmid, shmaddr, shmflg)        \
+  __sanitizer_syscall_pre_impl_shmat((long)(shmid), (long)(shmaddr), \
+                                     (long)(shmflg))
+#define __sanitizer_syscall_post_shmat(res, shmid, shmaddr, shmflg)        \
+  __sanitizer_syscall_post_impl_shmat(res, (long)(shmid), (long)(shmaddr), \
+                                      (long)(shmflg))
+#define __sanitizer_syscall_pre_shmget(key, size, flag) \
+  __sanitizer_syscall_pre_impl_shmget((long)(key), (long)(size), (long)(flag))
+#define __sanitizer_syscall_post_shmget(res, key, size, flag)          \
+  __sanitizer_syscall_post_impl_shmget(res, (long)(key), (long)(size), \
+                                       (long)(flag))
+#define __sanitizer_syscall_pre_shmdt(shmaddr) \
+  __sanitizer_syscall_pre_impl_shmdt((long)(shmaddr))
+#define __sanitizer_syscall_post_shmdt(res, shmaddr) \
+  __sanitizer_syscall_post_impl_shmdt(res, (long)(shmaddr))
+#define __sanitizer_syscall_pre_shmctl(shmid, cmd, buf) \
+  __sanitizer_syscall_pre_impl_shmctl((long)(shmid), (long)(cmd), (long)(buf))
+#define __sanitizer_syscall_post_shmctl(res, shmid, cmd, buf)           \
+  __sanitizer_syscall_post_impl_shmctl(res, (long)(shmid), (long)(cmd), \
+                                       (long)(buf))
+#define __sanitizer_syscall_pre_ipc(call, first, second, third, ptr, fifth)    \
+  __sanitizer_syscall_pre_impl_ipc((long)(call), (long)(first),                \
+                                   (long)(second), (long)(third), (long)(ptr), \
+                                   (long)(fifth))
+#define __sanitizer_syscall_post_ipc(res, call, first, second, third, ptr, \
+                                     fifth)                                \
+  __sanitizer_syscall_post_impl_ipc(res, (long)(call), (long)(first),      \
+                                    (long)(second), (long)(third),         \
+                                    (long)(ptr), (long)(fifth))
+#define __sanitizer_syscall_pre_mq_open(name, oflag, mode, attr)    \
+  __sanitizer_syscall_pre_impl_mq_open((long)(name), (long)(oflag), \
+                                       (long)(mode), (long)(attr))
+#define __sanitizer_syscall_post_mq_open(res, name, oflag, mode, attr)    \
+  __sanitizer_syscall_post_impl_mq_open(res, (long)(name), (long)(oflag), \
+                                        (long)(mode), (long)(attr))
+#define __sanitizer_syscall_pre_mq_unlink(name) \
+  __sanitizer_syscall_pre_impl_mq_unlink((long)(name))
+#define __sanitizer_syscall_post_mq_unlink(res, name) \
+  __sanitizer_syscall_post_impl_mq_unlink(res, (long)(name))
+#define __sanitizer_syscall_pre_mq_timedsend(mqdes, msg_ptr, msg_len,          \
+                                             msg_prio, abs_timeout)            \
+  __sanitizer_syscall_pre_impl_mq_timedsend((long)(mqdes), (long)(msg_ptr),    \
+                                            (long)(msg_len), (long)(msg_prio), \
+                                            (long)(abs_timeout))
+#define __sanitizer_syscall_post_mq_timedsend(res, mqdes, msg_ptr, msg_len,   \
+                                              msg_prio, abs_timeout)          \
+  __sanitizer_syscall_post_impl_mq_timedsend(                                 \
+      res, (long)(mqdes), (long)(msg_ptr), (long)(msg_len), (long)(msg_prio), \
+      (long)(abs_timeout))
+#define __sanitizer_syscall_pre_mq_timedreceive(mqdes, msg_ptr, msg_len, \
+                                                msg_prio, abs_timeout)   \
+  __sanitizer_syscall_pre_impl_mq_timedreceive(                          \
+      (long)(mqdes), (long)(msg_ptr), (long)(msg_len), (long)(msg_prio), \
+      (long)(abs_timeout))
+#define __sanitizer_syscall_post_mq_timedreceive(res, mqdes, msg_ptr, msg_len, \
+                                                 msg_prio, abs_timeout)        \
+  __sanitizer_syscall_post_impl_mq_timedreceive(                               \
+      res, (long)(mqdes), (long)(msg_ptr), (long)(msg_len), (long)(msg_prio),  \
+      (long)(abs_timeout))
+#define __sanitizer_syscall_pre_mq_notify(mqdes, notification) \
+  __sanitizer_syscall_pre_impl_mq_notify((long)(mqdes), (long)(notification))
+#define __sanitizer_syscall_post_mq_notify(res, mqdes, notification) \
+  __sanitizer_syscall_post_impl_mq_notify(res, (long)(mqdes),        \
+                                          (long)(notification))
+#define __sanitizer_syscall_pre_mq_getsetattr(mqdes, mqstat, omqstat)       \
+  __sanitizer_syscall_pre_impl_mq_getsetattr((long)(mqdes), (long)(mqstat), \
+                                             (long)(omqstat))
+#define __sanitizer_syscall_post_mq_getsetattr(res, mqdes, mqstat, omqstat) \
+  __sanitizer_syscall_post_impl_mq_getsetattr(res, (long)(mqdes),           \
+                                              (long)(mqstat), (long)(omqstat))
+#define __sanitizer_syscall_pre_pciconfig_iobase(which, bus, devfn)         \
+  __sanitizer_syscall_pre_impl_pciconfig_iobase((long)(which), (long)(bus), \
+                                                (long)(devfn))
+#define __sanitizer_syscall_post_pciconfig_iobase(res, which, bus, devfn) \
+  __sanitizer_syscall_post_impl_pciconfig_iobase(res, (long)(which),      \
+                                                 (long)(bus), (long)(devfn))
+#define __sanitizer_syscall_pre_pciconfig_read(bus, dfn, off, len, buf) \
+  __sanitizer_syscall_pre_impl_pciconfig_read(                          \
+      (long)(bus), (long)(dfn), (long)(off), (long)(len), (long)(buf))
+#define __sanitizer_syscall_post_pciconfig_read(res, bus, dfn, off, len, buf) \
+  __sanitizer_syscall_post_impl_pciconfig_read(                               \
+      res, (long)(bus), (long)(dfn), (long)(off), (long)(len), (long)(buf))
+#define __sanitizer_syscall_pre_pciconfig_write(bus, dfn, off, len, buf) \
+  __sanitizer_syscall_pre_impl_pciconfig_write(                          \
+      (long)(bus), (long)(dfn), (long)(off), (long)(len), (long)(buf))
+#define __sanitizer_syscall_post_pciconfig_write(res, bus, dfn, off, len, buf) \
+  __sanitizer_syscall_post_impl_pciconfig_write(                               \
+      res, (long)(bus), (long)(dfn), (long)(off), (long)(len), (long)(buf))
+#define __sanitizer_syscall_pre_swapon(specialfile, swap_flags) \
+  __sanitizer_syscall_pre_impl_swapon((long)(specialfile), (long)(swap_flags))
+#define __sanitizer_syscall_post_swapon(res, specialfile, swap_flags) \
+  __sanitizer_syscall_post_impl_swapon(res, (long)(specialfile),      \
+                                       (long)(swap_flags))
+#define __sanitizer_syscall_pre_swapoff(specialfile) \
+  __sanitizer_syscall_pre_impl_swapoff((long)(specialfile))
+#define __sanitizer_syscall_post_swapoff(res, specialfile) \
+  __sanitizer_syscall_post_impl_swapoff(res, (long)(specialfile))
+#define __sanitizer_syscall_pre_sysctl(args) \
+  __sanitizer_syscall_pre_impl_sysctl((long)(args))
+#define __sanitizer_syscall_post_sysctl(res, args) \
+  __sanitizer_syscall_post_impl_sysctl(res, (long)(args))
+#define __sanitizer_syscall_pre_sysinfo(info) \
+  __sanitizer_syscall_pre_impl_sysinfo((long)(info))
+#define __sanitizer_syscall_post_sysinfo(res, info) \
+  __sanitizer_syscall_post_impl_sysinfo(res, (long)(info))
+#define __sanitizer_syscall_pre_sysfs(option, arg1, arg2) \
+  __sanitizer_syscall_pre_impl_sysfs((long)(option), (long)(arg1), (long)(arg2))
+#define __sanitizer_syscall_post_sysfs(res, option, arg1, arg2)          \
+  __sanitizer_syscall_post_impl_sysfs(res, (long)(option), (long)(arg1), \
+                                      (long)(arg2))
+#define __sanitizer_syscall_pre_syslog(type, buf, len) \
+  __sanitizer_syscall_pre_impl_syslog((long)(type), (long)(buf), (long)(len))
+#define __sanitizer_syscall_post_syslog(res, type, buf, len)           \
+  __sanitizer_syscall_post_impl_syslog(res, (long)(type), (long)(buf), \
+                                       (long)(len))
+#define __sanitizer_syscall_pre_uselib(library) \
+  __sanitizer_syscall_pre_impl_uselib((long)(library))
+#define __sanitizer_syscall_post_uselib(res, library) \
+  __sanitizer_syscall_post_impl_uselib(res, (long)(library))
+#define __sanitizer_syscall_pre_ni_syscall() \
+  __sanitizer_syscall_pre_impl_ni_syscall()
+#define __sanitizer_syscall_post_ni_syscall(res) \
+  __sanitizer_syscall_post_impl_ni_syscall(res)
+#define __sanitizer_syscall_pre_ptrace(request, pid, addr, data)    \
+  __sanitizer_syscall_pre_impl_ptrace((long)(request), (long)(pid), \
+                                      (long)(addr), (long)(data))
+#define __sanitizer_syscall_post_ptrace(res, request, pid, addr, data)    \
+  __sanitizer_syscall_post_impl_ptrace(res, (long)(request), (long)(pid), \
+                                       (long)(addr), (long)(data))
+#define __sanitizer_syscall_pre_add_key(_type, _description, _payload, plen, \
+                                        destringid)                          \
+  __sanitizer_syscall_pre_impl_add_key((long)(_type), (long)(_description),  \
+                                       (long)(_payload), (long)(plen),       \
+                                       (long)(destringid))
+#define __sanitizer_syscall_post_add_key(res, _type, _description, _payload, \
+                                         plen, destringid)                   \
+  __sanitizer_syscall_post_impl_add_key(                                     \
+      res, (long)(_type), (long)(_description), (long)(_payload),            \
+      (long)(plen), (long)(destringid))
+#define __sanitizer_syscall_pre_request_key(_type, _description,       \
+                                            _callout_info, destringid) \
+  __sanitizer_syscall_pre_impl_request_key(                            \
+      (long)(_type), (long)(_description), (long)(_callout_info),      \
+      (long)(destringid))
+#define __sanitizer_syscall_post_request_key(res, _type, _description,  \
+                                             _callout_info, destringid) \
+  __sanitizer_syscall_post_impl_request_key(                            \
+      res, (long)(_type), (long)(_description), (long)(_callout_info),  \
+      (long)(destringid))
+#define __sanitizer_syscall_pre_keyctl(cmd, arg2, arg3, arg4, arg5)            \
+  __sanitizer_syscall_pre_impl_keyctl((long)(cmd), (long)(arg2), (long)(arg3), \
+                                      (long)(arg4), (long)(arg5))
+#define __sanitizer_syscall_post_keyctl(res, cmd, arg2, arg3, arg4, arg5) \
+  __sanitizer_syscall_post_impl_keyctl(res, (long)(cmd), (long)(arg2),    \
+                                       (long)(arg3), (long)(arg4),        \
+                                       (long)(arg5))
+#define __sanitizer_syscall_pre_ioprio_set(which, who, ioprio)        \
+  __sanitizer_syscall_pre_impl_ioprio_set((long)(which), (long)(who), \
+                                          (long)(ioprio))
+#define __sanitizer_syscall_post_ioprio_set(res, which, who, ioprio)        \
+  __sanitizer_syscall_post_impl_ioprio_set(res, (long)(which), (long)(who), \
+                                           (long)(ioprio))
+#define __sanitizer_syscall_pre_ioprio_get(which, who) \
+  __sanitizer_syscall_pre_impl_ioprio_get((long)(which), (long)(who))
+#define __sanitizer_syscall_post_ioprio_get(res, which, who) \
+  __sanitizer_syscall_post_impl_ioprio_get(res, (long)(which), (long)(who))
+#define __sanitizer_syscall_pre_set_mempolicy(mode, nmask, maxnode)       \
+  __sanitizer_syscall_pre_impl_set_mempolicy((long)(mode), (long)(nmask), \
+                                             (long)(maxnode))
+#define __sanitizer_syscall_post_set_mempolicy(res, mode, nmask, maxnode) \
+  __sanitizer_syscall_post_impl_set_mempolicy(res, (long)(mode),          \
+                                              (long)(nmask), (long)(maxnode))
+#define __sanitizer_syscall_pre_migrate_pages(pid, maxnode, from, to)      \
+  __sanitizer_syscall_pre_impl_migrate_pages((long)(pid), (long)(maxnode), \
+                                             (long)(from), (long)(to))
+#define __sanitizer_syscall_post_migrate_pages(res, pid, maxnode, from, to) \
+  __sanitizer_syscall_post_impl_migrate_pages(                              \
+      res, (long)(pid), (long)(maxnode), (long)(from), (long)(to))
+#define __sanitizer_syscall_pre_move_pages(pid, nr_pages, pages, nodes,  \
+                                           status, flags)                \
+  __sanitizer_syscall_pre_impl_move_pages((long)(pid), (long)(nr_pages), \
+                                          (long)(pages), (long)(nodes),  \
+                                          (long)(status), (long)(flags))
+#define __sanitizer_syscall_post_move_pages(res, pid, nr_pages, pages, nodes,  \
+                                            status, flags)                     \
+  __sanitizer_syscall_post_impl_move_pages(res, (long)(pid), (long)(nr_pages), \
+                                           (long)(pages), (long)(nodes),       \
+                                           (long)(status), (long)(flags))
+#define __sanitizer_syscall_pre_mbind(start, len, mode, nmask, maxnode, flags) \
+  __sanitizer_syscall_pre_impl_mbind((long)(start), (long)(len), (long)(mode), \
+                                     (long)(nmask), (long)(maxnode),           \
+                                     (long)(flags))
+#define __sanitizer_syscall_post_mbind(res, start, len, mode, nmask, maxnode, \
+                                       flags)                                 \
+  __sanitizer_syscall_post_impl_mbind(res, (long)(start), (long)(len),        \
+                                      (long)(mode), (long)(nmask),            \
+                                      (long)(maxnode), (long)(flags))
+#define __sanitizer_syscall_pre_get_mempolicy(policy, nmask, maxnode, addr, \
+                                              flags)                        \
+  __sanitizer_syscall_pre_impl_get_mempolicy((long)(policy), (long)(nmask), \
+                                             (long)(maxnode), (long)(addr), \
+                                             (long)(flags))
+#define __sanitizer_syscall_post_get_mempolicy(res, policy, nmask, maxnode,   \
+                                               addr, flags)                   \
+  __sanitizer_syscall_post_impl_get_mempolicy(res, (long)(policy),            \
+                                              (long)(nmask), (long)(maxnode), \
+                                              (long)(addr), (long)(flags))
+#define __sanitizer_syscall_pre_inotify_init() \
+  __sanitizer_syscall_pre_impl_inotify_init()
+#define __sanitizer_syscall_post_inotify_init(res) \
+  __sanitizer_syscall_post_impl_inotify_init(res)
+#define __sanitizer_syscall_pre_inotify_init1(flags) \
+  __sanitizer_syscall_pre_impl_inotify_init1((long)(flags))
+#define __sanitizer_syscall_post_inotify_init1(res, flags) \
+  __sanitizer_syscall_post_impl_inotify_init1(res, (long)(flags))
+#define __sanitizer_syscall_pre_inotify_add_watch(fd, path, mask)          \
+  __sanitizer_syscall_pre_impl_inotify_add_watch((long)(fd), (long)(path), \
+                                                 (long)(mask))
+#define __sanitizer_syscall_post_inotify_add_watch(res, fd, path, mask) \
+  __sanitizer_syscall_post_impl_inotify_add_watch(res, (long)(fd),      \
+                                                  (long)(path), (long)(mask))
+#define __sanitizer_syscall_pre_inotify_rm_watch(fd, wd) \
+  __sanitizer_syscall_pre_impl_inotify_rm_watch((long)(fd), (long)(wd))
+#define __sanitizer_syscall_post_inotify_rm_watch(res, fd, wd) \
+  __sanitizer_syscall_post_impl_inotify_rm_watch(res, (long)(fd), (long)(wd))
+#define __sanitizer_syscall_pre_spu_run(fd, unpc, ustatus)       \
+  __sanitizer_syscall_pre_impl_spu_run((long)(fd), (long)(unpc), \
+                                       (long)(ustatus))
+#define __sanitizer_syscall_post_spu_run(res, fd, unpc, ustatus)       \
+  __sanitizer_syscall_post_impl_spu_run(res, (long)(fd), (long)(unpc), \
+                                        (long)(ustatus))
+#define __sanitizer_syscall_pre_spu_create(name, flags, mode, fd)      \
+  __sanitizer_syscall_pre_impl_spu_create((long)(name), (long)(flags), \
+                                          (long)(mode), (long)(fd))
+#define __sanitizer_syscall_post_spu_create(res, name, flags, mode, fd)      \
+  __sanitizer_syscall_post_impl_spu_create(res, (long)(name), (long)(flags), \
+                                           (long)(mode), (long)(fd))
+#define __sanitizer_syscall_pre_mknodat(dfd, filename, mode, dev)     \
+  __sanitizer_syscall_pre_impl_mknodat((long)(dfd), (long)(filename), \
+                                       (long)(mode), (long)(dev))
+#define __sanitizer_syscall_post_mknodat(res, dfd, filename, mode, dev)     \
+  __sanitizer_syscall_post_impl_mknodat(res, (long)(dfd), (long)(filename), \
+                                        (long)(mode), (long)(dev))
+#define __sanitizer_syscall_pre_mkdirat(dfd, pathname, mode)          \
+  __sanitizer_syscall_pre_impl_mkdirat((long)(dfd), (long)(pathname), \
+                                       (long)(mode))
+#define __sanitizer_syscall_post_mkdirat(res, dfd, pathname, mode)          \
+  __sanitizer_syscall_post_impl_mkdirat(res, (long)(dfd), (long)(pathname), \
+                                        (long)(mode))
+#define __sanitizer_syscall_pre_unlinkat(dfd, pathname, flag)          \
+  __sanitizer_syscall_pre_impl_unlinkat((long)(dfd), (long)(pathname), \
+                                        (long)(flag))
+#define __sanitizer_syscall_post_unlinkat(res, dfd, pathname, flag)          \
+  __sanitizer_syscall_post_impl_unlinkat(res, (long)(dfd), (long)(pathname), \
+                                         (long)(flag))
+#define __sanitizer_syscall_pre_symlinkat(oldname, newdfd, newname)       \
+  __sanitizer_syscall_pre_impl_symlinkat((long)(oldname), (long)(newdfd), \
+                                         (long)(newname))
+#define __sanitizer_syscall_post_symlinkat(res, oldname, newdfd, newname) \
+  __sanitizer_syscall_post_impl_symlinkat(res, (long)(oldname),           \
+                                          (long)(newdfd), (long)(newname))
+#define __sanitizer_syscall_pre_linkat(olddfd, oldname, newdfd, newname, \
+                                       flags)                            \
+  __sanitizer_syscall_pre_impl_linkat((long)(olddfd), (long)(oldname),   \
+                                      (long)(newdfd), (long)(newname),   \
+                                      (long)(flags))
+#define __sanitizer_syscall_post_linkat(res, olddfd, oldname, newdfd, newname, \
+                                        flags)                                 \
+  __sanitizer_syscall_post_impl_linkat(res, (long)(olddfd), (long)(oldname),   \
+                                       (long)(newdfd), (long)(newname),        \
+                                       (long)(flags))
+#define __sanitizer_syscall_pre_renameat(olddfd, oldname, newdfd, newname) \
+  __sanitizer_syscall_pre_impl_renameat((long)(olddfd), (long)(oldname),   \
+                                        (long)(newdfd), (long)(newname))
+#define __sanitizer_syscall_post_renameat(res, olddfd, oldname, newdfd,        \
+                                          newname)                             \
+  __sanitizer_syscall_post_impl_renameat(res, (long)(olddfd), (long)(oldname), \
+                                         (long)(newdfd), (long)(newname))
+#define __sanitizer_syscall_pre_futimesat(dfd, filename, utimes)        \
+  __sanitizer_syscall_pre_impl_futimesat((long)(dfd), (long)(filename), \
+                                         (long)(utimes))
+#define __sanitizer_syscall_post_futimesat(res, dfd, filename, utimes)        \
+  __sanitizer_syscall_post_impl_futimesat(res, (long)(dfd), (long)(filename), \
+                                          (long)(utimes))
+#define __sanitizer_syscall_pre_faccessat(dfd, filename, mode)          \
+  __sanitizer_syscall_pre_impl_faccessat((long)(dfd), (long)(filename), \
+                                         (long)(mode))
+#define __sanitizer_syscall_post_faccessat(res, dfd, filename, mode)          \
+  __sanitizer_syscall_post_impl_faccessat(res, (long)(dfd), (long)(filename), \
+                                          (long)(mode))
+#define __sanitizer_syscall_pre_fchmodat(dfd, filename, mode)          \
+  __sanitizer_syscall_pre_impl_fchmodat((long)(dfd), (long)(filename), \
+                                        (long)(mode))
+#define __sanitizer_syscall_post_fchmodat(res, dfd, filename, mode)          \
+  __sanitizer_syscall_post_impl_fchmodat(res, (long)(dfd), (long)(filename), \
+                                         (long)(mode))
+#define __sanitizer_syscall_pre_fchownat(dfd, filename, user, group, flag) \
+  __sanitizer_syscall_pre_impl_fchownat((long)(dfd), (long)(filename),     \
+                                        (long)(user), (long)(group),       \
+                                        (long)(flag))
+#define __sanitizer_syscall_post_fchownat(res, dfd, filename, user, group,   \
+                                          flag)                              \
+  __sanitizer_syscall_post_impl_fchownat(res, (long)(dfd), (long)(filename), \
+                                         (long)(user), (long)(group),        \
+                                         (long)(flag))
+#define __sanitizer_syscall_pre_openat(dfd, filename, flags, mode)   \
+  __sanitizer_syscall_pre_impl_openat((long)(dfd), (long)(filename), \
+                                      (long)(flags), (long)(mode))
+#define __sanitizer_syscall_post_openat(res, dfd, filename, flags, mode)   \
+  __sanitizer_syscall_post_impl_openat(res, (long)(dfd), (long)(filename), \
+                                       (long)(flags), (long)(mode))
+#define __sanitizer_syscall_pre_newfstatat(dfd, filename, statbuf, flag) \
+  __sanitizer_syscall_pre_impl_newfstatat((long)(dfd), (long)(filename), \
+                                          (long)(statbuf), (long)(flag))
+#define __sanitizer_syscall_post_newfstatat(res, dfd, filename, statbuf, flag) \
+  __sanitizer_syscall_post_impl_newfstatat(res, (long)(dfd), (long)(filename), \
+                                           (long)(statbuf), (long)(flag))
+#define __sanitizer_syscall_pre_fstatat64(dfd, filename, statbuf, flag) \
+  __sanitizer_syscall_pre_impl_fstatat64((long)(dfd), (long)(filename), \
+                                         (long)(statbuf), (long)(flag))
+#define __sanitizer_syscall_post_fstatat64(res, dfd, filename, statbuf, flag) \
+  __sanitizer_syscall_post_impl_fstatat64(res, (long)(dfd), (long)(filename), \
+                                          (long)(statbuf), (long)(flag))
+#define __sanitizer_syscall_pre_readlinkat(dfd, path, buf, bufsiz)   \
+  __sanitizer_syscall_pre_impl_readlinkat((long)(dfd), (long)(path), \
+                                          (long)(buf), (long)(bufsiz))
+#define __sanitizer_syscall_post_readlinkat(res, dfd, path, buf, bufsiz)   \
+  __sanitizer_syscall_post_impl_readlinkat(res, (long)(dfd), (long)(path), \
+                                           (long)(buf), (long)(bufsiz))
+#define __sanitizer_syscall_pre_utimensat(dfd, filename, utimes, flags) \
+  __sanitizer_syscall_pre_impl_utimensat((long)(dfd), (long)(filename), \
+                                         (long)(utimes), (long)(flags))
+#define __sanitizer_syscall_post_utimensat(res, dfd, filename, utimes, flags) \
+  __sanitizer_syscall_post_impl_utimensat(res, (long)(dfd), (long)(filename), \
+                                          (long)(utimes), (long)(flags))
+#define __sanitizer_syscall_pre_unshare(unshare_flags) \
+  __sanitizer_syscall_pre_impl_unshare((long)(unshare_flags))
+#define __sanitizer_syscall_post_unshare(res, unshare_flags) \
+  __sanitizer_syscall_post_impl_unshare(res, (long)(unshare_flags))
+#define __sanitizer_syscall_pre_splice(fd_in, off_in, fd_out, off_out, len, \
+                                       flags)                               \
+  __sanitizer_syscall_pre_impl_splice((long)(fd_in), (long)(off_in),        \
+                                      (long)(fd_out), (long)(off_out),      \
+                                      (long)(len), (long)(flags))
+#define __sanitizer_syscall_post_splice(res, fd_in, off_in, fd_out, off_out, \
+                                        len, flags)                          \
+  __sanitizer_syscall_post_impl_splice(res, (long)(fd_in), (long)(off_in),   \
+                                       (long)(fd_out), (long)(off_out),      \
+                                       (long)(len), (long)(flags))
+#define __sanitizer_syscall_pre_vmsplice(fd, iov, nr_segs, flags) \
+  __sanitizer_syscall_pre_impl_vmsplice((long)(fd), (long)(iov),  \
+                                        (long)(nr_segs), (long)(flags))
+#define __sanitizer_syscall_post_vmsplice(res, fd, iov, nr_segs, flags) \
+  __sanitizer_syscall_post_impl_vmsplice(res, (long)(fd), (long)(iov),  \
+                                         (long)(nr_segs), (long)(flags))
+#define __sanitizer_syscall_pre_tee(fdin, fdout, len, flags)                 \
+  __sanitizer_syscall_pre_impl_tee((long)(fdin), (long)(fdout), (long)(len), \
+                                   (long)(flags))
+#define __sanitizer_syscall_post_tee(res, fdin, fdout, len, flags)    \
+  __sanitizer_syscall_post_impl_tee(res, (long)(fdin), (long)(fdout), \
+                                    (long)(len), (long)(flags))
+#define __sanitizer_syscall_pre_get_robust_list(pid, head_ptr, len_ptr)       \
+  __sanitizer_syscall_pre_impl_get_robust_list((long)(pid), (long)(head_ptr), \
+                                               (long)(len_ptr))
+#define __sanitizer_syscall_post_get_robust_list(res, pid, head_ptr, len_ptr) \
+  __sanitizer_syscall_post_impl_get_robust_list(                              \
+      res, (long)(pid), (long)(head_ptr), (long)(len_ptr))
+#define __sanitizer_syscall_pre_set_robust_list(head, len) \
+  __sanitizer_syscall_pre_impl_set_robust_list((long)(head), (long)(len))
+#define __sanitizer_syscall_post_set_robust_list(res, head, len) \
+  __sanitizer_syscall_post_impl_set_robust_list(res, (long)(head), (long)(len))
+#define __sanitizer_syscall_pre_getcpu(cpu, node, cache) \
+  __sanitizer_syscall_pre_impl_getcpu((long)(cpu), (long)(node), (long)(cache))
+#define __sanitizer_syscall_post_getcpu(res, cpu, node, cache)         \
+  __sanitizer_syscall_post_impl_getcpu(res, (long)(cpu), (long)(node), \
+                                       (long)(cache))
+#define __sanitizer_syscall_pre_signalfd(ufd, user_mask, sizemask)      \
+  __sanitizer_syscall_pre_impl_signalfd((long)(ufd), (long)(user_mask), \
+                                        (long)(sizemask))
+#define __sanitizer_syscall_post_signalfd(res, ufd, user_mask, sizemask)      \
+  __sanitizer_syscall_post_impl_signalfd(res, (long)(ufd), (long)(user_mask), \
+                                         (long)(sizemask))
+#define __sanitizer_syscall_pre_signalfd4(ufd, user_mask, sizemask, flags) \
+  __sanitizer_syscall_pre_impl_signalfd4((long)(ufd), (long)(user_mask),   \
+                                         (long)(sizemask), (long)(flags))
+#define __sanitizer_syscall_post_signalfd4(res, ufd, user_mask, sizemask,      \
+                                           flags)                              \
+  __sanitizer_syscall_post_impl_signalfd4(res, (long)(ufd), (long)(user_mask), \
+                                          (long)(sizemask), (long)(flags))
+#define __sanitizer_syscall_pre_timerfd_create(clockid, flags) \
+  __sanitizer_syscall_pre_impl_timerfd_create((long)(clockid), (long)(flags))
+#define __sanitizer_syscall_post_timerfd_create(res, clockid, flags) \
+  __sanitizer_syscall_post_impl_timerfd_create(res, (long)(clockid), \
+                                               (long)(flags))
+#define __sanitizer_syscall_pre_timerfd_settime(ufd, flags, utmr, otmr)    \
+  __sanitizer_syscall_pre_impl_timerfd_settime((long)(ufd), (long)(flags), \
+                                               (long)(utmr), (long)(otmr))
+#define __sanitizer_syscall_post_timerfd_settime(res, ufd, flags, utmr, otmr) \
+  __sanitizer_syscall_post_impl_timerfd_settime(                              \
+      res, (long)(ufd), (long)(flags), (long)(utmr), (long)(otmr))
+#define __sanitizer_syscall_pre_timerfd_gettime(ufd, otmr) \
+  __sanitizer_syscall_pre_impl_timerfd_gettime((long)(ufd), (long)(otmr))
+#define __sanitizer_syscall_post_timerfd_gettime(res, ufd, otmr) \
+  __sanitizer_syscall_post_impl_timerfd_gettime(res, (long)(ufd), (long)(otmr))
+#define __sanitizer_syscall_pre_eventfd(count) \
+  __sanitizer_syscall_pre_impl_eventfd((long)(count))
+#define __sanitizer_syscall_post_eventfd(res, count) \
+  __sanitizer_syscall_post_impl_eventfd(res, (long)(count))
+#define __sanitizer_syscall_pre_eventfd2(count, flags) \
+  __sanitizer_syscall_pre_impl_eventfd2((long)(count), (long)(flags))
+#define __sanitizer_syscall_post_eventfd2(res, count, flags) \
+  __sanitizer_syscall_post_impl_eventfd2(res, (long)(count), (long)(flags))
+#define __sanitizer_syscall_pre_old_readdir(arg0, arg1, arg2)          \
+  __sanitizer_syscall_pre_impl_old_readdir((long)(arg0), (long)(arg1), \
+                                           (long)(arg2))
+#define __sanitizer_syscall_post_old_readdir(res, arg0, arg1, arg2)          \
+  __sanitizer_syscall_post_impl_old_readdir(res, (long)(arg0), (long)(arg1), \
+                                            (long)(arg2))
+#define __sanitizer_syscall_pre_pselect6(arg0, arg1, arg2, arg3, arg4, arg5) \
+  __sanitizer_syscall_pre_impl_pselect6((long)(arg0), (long)(arg1),          \
+                                        (long)(arg2), (long)(arg3),          \
+                                        (long)(arg4), (long)(arg5))
+#define __sanitizer_syscall_post_pselect6(res, arg0, arg1, arg2, arg3, arg4, \
+                                          arg5)                              \
+  __sanitizer_syscall_post_impl_pselect6(res, (long)(arg0), (long)(arg1),    \
+                                         (long)(arg2), (long)(arg3),         \
+                                         (long)(arg4), (long)(arg5))
+#define __sanitizer_syscall_pre_ppoll(arg0, arg1, arg2, arg3, arg4)            \
+  __sanitizer_syscall_pre_impl_ppoll((long)(arg0), (long)(arg1), (long)(arg2), \
+                                     (long)(arg3), (long)(arg4))
+#define __sanitizer_syscall_post_ppoll(res, arg0, arg1, arg2, arg3, arg4) \
+  __sanitizer_syscall_post_impl_ppoll(res, (long)(arg0), (long)(arg1),    \
+                                      (long)(arg2), (long)(arg3),         \
+                                      (long)(arg4))
+#define __sanitizer_syscall_pre_syncfs(fd) \
+  __sanitizer_syscall_pre_impl_syncfs((long)(fd))
+#define __sanitizer_syscall_post_syncfs(res, fd) \
+  __sanitizer_syscall_post_impl_syncfs(res, (long)(fd))
+#define __sanitizer_syscall_pre_perf_event_open(attr_uptr, pid, cpu, group_fd, \
+                                                flags)                         \
+  __sanitizer_syscall_pre_impl_perf_event_open((long)(attr_uptr), (long)(pid), \
+                                               (long)(cpu), (long)(group_fd),  \
+                                               (long)(flags))
+#define __sanitizer_syscall_post_perf_event_open(res, attr_uptr, pid, cpu, \
+                                                 group_fd, flags)          \
+  __sanitizer_syscall_post_impl_perf_event_open(                           \
+      res, (long)(attr_uptr), (long)(pid), (long)(cpu), (long)(group_fd),  \
+      (long)(flags))
+#define __sanitizer_syscall_pre_mmap_pgoff(addr, len, prot, flags, fd, pgoff) \
+  __sanitizer_syscall_pre_impl_mmap_pgoff((long)(addr), (long)(len),          \
+                                          (long)(prot), (long)(flags),        \
+                                          (long)(fd), (long)(pgoff))
+#define __sanitizer_syscall_post_mmap_pgoff(res, addr, len, prot, flags, fd, \
+                                            pgoff)                           \
+  __sanitizer_syscall_post_impl_mmap_pgoff(res, (long)(addr), (long)(len),   \
+                                           (long)(prot), (long)(flags),      \
+                                           (long)(fd), (long)(pgoff))
+#define __sanitizer_syscall_pre_old_mmap(arg) \
+  __sanitizer_syscall_pre_impl_old_mmap((long)(arg))
+#define __sanitizer_syscall_post_old_mmap(res, arg) \
+  __sanitizer_syscall_post_impl_old_mmap(res, (long)(arg))
+#define __sanitizer_syscall_pre_name_to_handle_at(dfd, name, handle, mnt_id, \
+                                                  flag)                      \
+  __sanitizer_syscall_pre_impl_name_to_handle_at(                            \
+      (long)(dfd), (long)(name), (long)(handle), (long)(mnt_id), (long)(flag))
+#define __sanitizer_syscall_post_name_to_handle_at(res, dfd, name, handle, \
+                                                   mnt_id, flag)           \
+  __sanitizer_syscall_post_impl_name_to_handle_at(                         \
+      res, (long)(dfd), (long)(name), (long)(handle), (long)(mnt_id),      \
+      (long)(flag))
+#define __sanitizer_syscall_pre_open_by_handle_at(mountdirfd, handle, flags) \
+  __sanitizer_syscall_pre_impl_open_by_handle_at(                            \
+      (long)(mountdirfd), (long)(handle), (long)(flags))
+#define __sanitizer_syscall_post_open_by_handle_at(res, mountdirfd, handle, \
+                                                   flags)                   \
+  __sanitizer_syscall_post_impl_open_by_handle_at(                          \
+      res, (long)(mountdirfd), (long)(handle), (long)(flags))
+#define __sanitizer_syscall_pre_setns(fd, nstype) \
+  __sanitizer_syscall_pre_impl_setns((long)(fd), (long)(nstype))
+#define __sanitizer_syscall_post_setns(res, fd, nstype) \
+  __sanitizer_syscall_post_impl_setns(res, (long)(fd), (long)(nstype))
+#define __sanitizer_syscall_pre_process_vm_readv(pid, lvec, liovcnt, rvec, \
+                                                 riovcnt, flags)           \
+  __sanitizer_syscall_pre_impl_process_vm_readv(                           \
+      (long)(pid), (long)(lvec), (long)(liovcnt), (long)(rvec),            \
+      (long)(riovcnt), (long)(flags))
+#define __sanitizer_syscall_post_process_vm_readv(res, pid, lvec, liovcnt, \
+                                                  rvec, riovcnt, flags)    \
+  __sanitizer_syscall_post_impl_process_vm_readv(                          \
+      res, (long)(pid), (long)(lvec), (long)(liovcnt), (long)(rvec),       \
+      (long)(riovcnt), (long)(flags))
+#define __sanitizer_syscall_pre_process_vm_writev(pid, lvec, liovcnt, rvec, \
+                                                  riovcnt, flags)           \
+  __sanitizer_syscall_pre_impl_process_vm_writev(                           \
+      (long)(pid), (long)(lvec), (long)(liovcnt), (long)(rvec),             \
+      (long)(riovcnt), (long)(flags))
+#define __sanitizer_syscall_post_process_vm_writev(res, pid, lvec, liovcnt, \
+                                                   rvec, riovcnt, flags)    \
+  __sanitizer_syscall_post_impl_process_vm_writev(                          \
+      res, (long)(pid), (long)(lvec), (long)(liovcnt), (long)(rvec),        \
+      (long)(riovcnt), (long)(flags))
+#define __sanitizer_syscall_pre_fork() \
+  __sanitizer_syscall_pre_impl_fork()
+#define __sanitizer_syscall_post_fork(res) \
+  __sanitizer_syscall_post_impl_fork(res)
+#define __sanitizer_syscall_pre_vfork() \
+  __sanitizer_syscall_pre_impl_vfork()
+#define __sanitizer_syscall_post_vfork(res) \
+  __sanitizer_syscall_post_impl_vfork(res)
+#define __sanitizer_syscall_pre_sigaction(signum, act, oldact)                 \
+  __sanitizer_syscall_pre_impl_sigaction((long)signum, (long)act, (long)oldact)
+#define __sanitizer_syscall_post_sigaction(res, signum, act, oldact)           \
+  __sanitizer_syscall_post_impl_sigaction(res, (long)signum, (long)act,        \
+                                          (long)oldact)
+#define __sanitizer_syscall_pre_rt_sigaction(signum, act, oldact, sz)          \
+  __sanitizer_syscall_pre_impl_rt_sigaction((long)signum, (long)act,           \
+                                            (long)oldact, (long)sz)
+#define __sanitizer_syscall_post_rt_sigaction(res, signum, act, oldact, sz)    \
+  __sanitizer_syscall_post_impl_rt_sigaction(res, (long)signum, (long)act,     \
+                                             (long)oldact, (long)sz)
+
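+// Illustrative usage sketch (not part of this interface; assumes
+// <sys/syscall.h> for syscall() and __NR_setns): bracket the raw syscall
+// with the matching pre/post hooks defined above so the sanitizer can
+// inspect the arguments and the result.
+//
+//   __sanitizer_syscall_pre_setns(fd, nstype);
+//   long res = syscall(__NR_setns, fd, nstype);
+//   __sanitizer_syscall_post_setns(res, fd, nstype);
+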
+// And now a few syscalls we don't handle yet.
+#define __sanitizer_syscall_pre_afs_syscall(...)
+#define __sanitizer_syscall_pre_arch_prctl(...)
+#define __sanitizer_syscall_pre_break(...)
+#define __sanitizer_syscall_pre_chown32(...)
+#define __sanitizer_syscall_pre_clone(...)
+#define __sanitizer_syscall_pre_create_module(...)
+#define __sanitizer_syscall_pre_epoll_ctl_old(...)
+#define __sanitizer_syscall_pre_epoll_wait_old(...)
+#define __sanitizer_syscall_pre_execve(...)
+#define __sanitizer_syscall_pre_fadvise64(...)
+#define __sanitizer_syscall_pre_fadvise64_64(...)
+#define __sanitizer_syscall_pre_fallocate(...)
+#define __sanitizer_syscall_pre_fanotify_init(...)
+#define __sanitizer_syscall_pre_fanotify_mark(...)
+#define __sanitizer_syscall_pre_fchown32(...)
+#define __sanitizer_syscall_pre_ftime(...)
+#define __sanitizer_syscall_pre_ftruncate64(...)
+#define __sanitizer_syscall_pre_futex(...)
+#define __sanitizer_syscall_pre_getegid32(...)
+#define __sanitizer_syscall_pre_geteuid32(...)
+#define __sanitizer_syscall_pre_getgid32(...)
+#define __sanitizer_syscall_pre_getgroups32(...)
+#define __sanitizer_syscall_pre_get_kernel_syms(...)
+#define __sanitizer_syscall_pre_getpmsg(...)
+#define __sanitizer_syscall_pre_getresgid32(...)
+#define __sanitizer_syscall_pre_getresuid32(...)
+#define __sanitizer_syscall_pre_get_thread_area(...)
+#define __sanitizer_syscall_pre_getuid32(...)
+#define __sanitizer_syscall_pre_gtty(...)
+#define __sanitizer_syscall_pre_idle(...)
+#define __sanitizer_syscall_pre_iopl(...)
+#define __sanitizer_syscall_pre_lchown32(...)
+#define __sanitizer_syscall_pre__llseek(...)
+#define __sanitizer_syscall_pre_lock(...)
+#define __sanitizer_syscall_pre_madvise1(...)
+#define __sanitizer_syscall_pre_mmap(...)
+#define __sanitizer_syscall_pre_mmap2(...)
+#define __sanitizer_syscall_pre_modify_ldt(...)
+#define __sanitizer_syscall_pre_mpx(...)
+#define __sanitizer_syscall_pre__newselect(...)
+#define __sanitizer_syscall_pre_nfsservctl(...)
+#define __sanitizer_syscall_pre_oldfstat(...)
+#define __sanitizer_syscall_pre_oldlstat(...)
+#define __sanitizer_syscall_pre_oldolduname(...)
+#define __sanitizer_syscall_pre_oldstat(...)
+#define __sanitizer_syscall_pre_prctl(...)
+#define __sanitizer_syscall_pre_prof(...)
+#define __sanitizer_syscall_pre_profil(...)
+#define __sanitizer_syscall_pre_putpmsg(...)
+#define __sanitizer_syscall_pre_query_module(...)
+#define __sanitizer_syscall_pre_readahead(...)
+#define __sanitizer_syscall_pre_readdir(...)
+#define __sanitizer_syscall_pre_rt_sigreturn(...)
+#define __sanitizer_syscall_pre_rt_sigsuspend(...)
+#define __sanitizer_syscall_pre_security(...)
+#define __sanitizer_syscall_pre_setfsgid32(...)
+#define __sanitizer_syscall_pre_setfsuid32(...)
+#define __sanitizer_syscall_pre_setgid32(...)
+#define __sanitizer_syscall_pre_setgroups32(...)
+#define __sanitizer_syscall_pre_setregid32(...)
+#define __sanitizer_syscall_pre_setresgid32(...)
+#define __sanitizer_syscall_pre_setresuid32(...)
+#define __sanitizer_syscall_pre_setreuid32(...)
+#define __sanitizer_syscall_pre_set_thread_area(...)
+#define __sanitizer_syscall_pre_setuid32(...)
+#define __sanitizer_syscall_pre_sigaltstack(...)
+#define __sanitizer_syscall_pre_sigreturn(...)
+#define __sanitizer_syscall_pre_sigsuspend(...)
+#define __sanitizer_syscall_pre_stty(...)
+#define __sanitizer_syscall_pre_sync_file_range(...)
+#define __sanitizer_syscall_pre__sysctl(...)
+#define __sanitizer_syscall_pre_truncate64(...)
+#define __sanitizer_syscall_pre_tuxcall(...)
+#define __sanitizer_syscall_pre_ugetrlimit(...)
+#define __sanitizer_syscall_pre_ulimit(...)
+#define __sanitizer_syscall_pre_umount2(...)
+#define __sanitizer_syscall_pre_vm86(...)
+#define __sanitizer_syscall_pre_vm86old(...)
+#define __sanitizer_syscall_pre_vserver(...)
+
+#define __sanitizer_syscall_post_afs_syscall(res, ...)
+#define __sanitizer_syscall_post_arch_prctl(res, ...)
+#define __sanitizer_syscall_post_break(res, ...)
+#define __sanitizer_syscall_post_chown32(res, ...)
+#define __sanitizer_syscall_post_clone(res, ...)
+#define __sanitizer_syscall_post_create_module(res, ...)
+#define __sanitizer_syscall_post_epoll_ctl_old(res, ...)
+#define __sanitizer_syscall_post_epoll_wait_old(res, ...)
+#define __sanitizer_syscall_post_execve(res, ...)
+#define __sanitizer_syscall_post_fadvise64(res, ...)
+#define __sanitizer_syscall_post_fadvise64_64(res, ...)
+#define __sanitizer_syscall_post_fallocate(res, ...)
+#define __sanitizer_syscall_post_fanotify_init(res, ...)
+#define __sanitizer_syscall_post_fanotify_mark(res, ...)
+#define __sanitizer_syscall_post_fchown32(res, ...)
+#define __sanitizer_syscall_post_ftime(res, ...)
+#define __sanitizer_syscall_post_ftruncate64(res, ...)
+#define __sanitizer_syscall_post_futex(res, ...)
+#define __sanitizer_syscall_post_getegid32(res, ...)
+#define __sanitizer_syscall_post_geteuid32(res, ...)
+#define __sanitizer_syscall_post_getgid32(res, ...)
+#define __sanitizer_syscall_post_getgroups32(res, ...)
+#define __sanitizer_syscall_post_get_kernel_syms(res, ...)
+#define __sanitizer_syscall_post_getpmsg(res, ...)
+#define __sanitizer_syscall_post_getresgid32(res, ...)
+#define __sanitizer_syscall_post_getresuid32(res, ...)
+#define __sanitizer_syscall_post_get_thread_area(res, ...)
+#define __sanitizer_syscall_post_getuid32(res, ...)
+#define __sanitizer_syscall_post_gtty(res, ...)
+#define __sanitizer_syscall_post_idle(res, ...)
+#define __sanitizer_syscall_post_iopl(res, ...)
+#define __sanitizer_syscall_post_lchown32(res, ...)
+#define __sanitizer_syscall_post__llseek(res, ...)
+#define __sanitizer_syscall_post_lock(res, ...)
+#define __sanitizer_syscall_post_madvise1(res, ...)
+#define __sanitizer_syscall_post_mmap2(res, ...)
+#define __sanitizer_syscall_post_mmap(res, ...)
+#define __sanitizer_syscall_post_modify_ldt(res, ...)
+#define __sanitizer_syscall_post_mpx(res, ...)
+#define __sanitizer_syscall_post__newselect(res, ...)
+#define __sanitizer_syscall_post_nfsservctl(res, ...)
+#define __sanitizer_syscall_post_oldfstat(res, ...)
+#define __sanitizer_syscall_post_oldlstat(res, ...)
+#define __sanitizer_syscall_post_oldolduname(res, ...)
+#define __sanitizer_syscall_post_oldstat(res, ...)
+#define __sanitizer_syscall_post_prctl(res, ...)
+#define __sanitizer_syscall_post_profil(res, ...)
+#define __sanitizer_syscall_post_prof(res, ...)
+#define __sanitizer_syscall_post_putpmsg(res, ...)
+#define __sanitizer_syscall_post_query_module(res, ...)
+#define __sanitizer_syscall_post_readahead(res, ...)
+#define __sanitizer_syscall_post_readdir(res, ...)
+#define __sanitizer_syscall_post_rt_sigreturn(res, ...)
+#define __sanitizer_syscall_post_rt_sigsuspend(res, ...)
+#define __sanitizer_syscall_post_security(res, ...)
+#define __sanitizer_syscall_post_setfsgid32(res, ...)
+#define __sanitizer_syscall_post_setfsuid32(res, ...)
+#define __sanitizer_syscall_post_setgid32(res, ...)
+#define __sanitizer_syscall_post_setgroups32(res, ...)
+#define __sanitizer_syscall_post_setregid32(res, ...)
+#define __sanitizer_syscall_post_setresgid32(res, ...)
+#define __sanitizer_syscall_post_setresuid32(res, ...)
+#define __sanitizer_syscall_post_setreuid32(res, ...)
+#define __sanitizer_syscall_post_set_thread_area(res, ...)
+#define __sanitizer_syscall_post_setuid32(res, ...)
+#define __sanitizer_syscall_post_sigaltstack(res, ...)
+#define __sanitizer_syscall_post_sigreturn(res, ...)
+#define __sanitizer_syscall_post_sigsuspend(res, ...)
+#define __sanitizer_syscall_post_stty(res, ...)
+#define __sanitizer_syscall_post_sync_file_range(res, ...)
+#define __sanitizer_syscall_post__sysctl(res, ...)
+#define __sanitizer_syscall_post_truncate64(res, ...)
+#define __sanitizer_syscall_post_tuxcall(res, ...)
+#define __sanitizer_syscall_post_ugetrlimit(res, ...)
+#define __sanitizer_syscall_post_ulimit(res, ...)
+#define __sanitizer_syscall_post_umount2(res, ...)
+#define __sanitizer_syscall_post_vm86old(res, ...)
+#define __sanitizer_syscall_post_vm86(res, ...)
+#define __sanitizer_syscall_post_vserver(res, ...)
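+
+// Note (illustrative): the variadic macros above have empty bodies, so a
+// call such as
+//   __sanitizer_syscall_pre_futex(uaddr, op, val, utime, uaddr2, val3);
+// compiles away entirely until a checking implementation is added.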
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Private declarations. Do not call directly from user code. Use macros above.
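+// For example, __sanitizer_syscall_pre_setns(fd, nstype) above simply casts
+// its arguments and forwards to
+// __sanitizer_syscall_pre_impl_setns((long)(fd), (long)(nstype)).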
+void __sanitizer_syscall_pre_impl_time(long tloc);
+void __sanitizer_syscall_post_impl_time(long res, long tloc);
+void __sanitizer_syscall_pre_impl_stime(long tptr);
+void __sanitizer_syscall_post_impl_stime(long res, long tptr);
+void __sanitizer_syscall_pre_impl_gettimeofday(long tv, long tz);
+void __sanitizer_syscall_post_impl_gettimeofday(long res, long tv, long tz);
+void __sanitizer_syscall_pre_impl_settimeofday(long tv, long tz);
+void __sanitizer_syscall_post_impl_settimeofday(long res, long tv, long tz);
+void __sanitizer_syscall_pre_impl_adjtimex(long txc_p);
+void __sanitizer_syscall_post_impl_adjtimex(long res, long txc_p);
+void __sanitizer_syscall_pre_impl_times(long tbuf);
+void __sanitizer_syscall_post_impl_times(long res, long tbuf);
+void __sanitizer_syscall_pre_impl_gettid();
+void __sanitizer_syscall_post_impl_gettid(long res);
+void __sanitizer_syscall_pre_impl_nanosleep(long rqtp, long rmtp);
+void __sanitizer_syscall_post_impl_nanosleep(long res, long rqtp, long rmtp);
+void __sanitizer_syscall_pre_impl_alarm(long seconds);
+void __sanitizer_syscall_post_impl_alarm(long res, long seconds);
+void __sanitizer_syscall_pre_impl_getpid();
+void __sanitizer_syscall_post_impl_getpid(long res);
+void __sanitizer_syscall_pre_impl_getppid();
+void __sanitizer_syscall_post_impl_getppid(long res);
+void __sanitizer_syscall_pre_impl_getuid();
+void __sanitizer_syscall_post_impl_getuid(long res);
+void __sanitizer_syscall_pre_impl_geteuid();
+void __sanitizer_syscall_post_impl_geteuid(long res);
+void __sanitizer_syscall_pre_impl_getgid();
+void __sanitizer_syscall_post_impl_getgid(long res);
+void __sanitizer_syscall_pre_impl_getegid();
+void __sanitizer_syscall_post_impl_getegid(long res);
+void __sanitizer_syscall_pre_impl_getresuid(long ruid, long euid, long suid);
+void __sanitizer_syscall_post_impl_getresuid(long res, long ruid, long euid,
+                                             long suid);
+void __sanitizer_syscall_pre_impl_getresgid(long rgid, long egid, long sgid);
+void __sanitizer_syscall_post_impl_getresgid(long res, long rgid, long egid,
+                                             long sgid);
+void __sanitizer_syscall_pre_impl_getpgid(long pid);
+void __sanitizer_syscall_post_impl_getpgid(long res, long pid);
+void __sanitizer_syscall_pre_impl_getpgrp();
+void __sanitizer_syscall_post_impl_getpgrp(long res);
+void __sanitizer_syscall_pre_impl_getsid(long pid);
+void __sanitizer_syscall_post_impl_getsid(long res, long pid);
+void __sanitizer_syscall_pre_impl_getgroups(long gidsetsize, long grouplist);
+void __sanitizer_syscall_post_impl_getgroups(long res, long gidsetsize,
+                                             long grouplist);
+void __sanitizer_syscall_pre_impl_setregid(long rgid, long egid);
+void __sanitizer_syscall_post_impl_setregid(long res, long rgid, long egid);
+void __sanitizer_syscall_pre_impl_setgid(long gid);
+void __sanitizer_syscall_post_impl_setgid(long res, long gid);
+void __sanitizer_syscall_pre_impl_setreuid(long ruid, long euid);
+void __sanitizer_syscall_post_impl_setreuid(long res, long ruid, long euid);
+void __sanitizer_syscall_pre_impl_setuid(long uid);
+void __sanitizer_syscall_post_impl_setuid(long res, long uid);
+void __sanitizer_syscall_pre_impl_setresuid(long ruid, long euid, long suid);
+void __sanitizer_syscall_post_impl_setresuid(long res, long ruid, long euid,
+                                             long suid);
+void __sanitizer_syscall_pre_impl_setresgid(long rgid, long egid, long sgid);
+void __sanitizer_syscall_post_impl_setresgid(long res, long rgid, long egid,
+                                             long sgid);
+void __sanitizer_syscall_pre_impl_setfsuid(long uid);
+void __sanitizer_syscall_post_impl_setfsuid(long res, long uid);
+void __sanitizer_syscall_pre_impl_setfsgid(long gid);
+void __sanitizer_syscall_post_impl_setfsgid(long res, long gid);
+void __sanitizer_syscall_pre_impl_setpgid(long pid, long pgid);
+void __sanitizer_syscall_post_impl_setpgid(long res, long pid, long pgid);
+void __sanitizer_syscall_pre_impl_setsid();
+void __sanitizer_syscall_post_impl_setsid(long res);
+void __sanitizer_syscall_pre_impl_setgroups(long gidsetsize, long grouplist);
+void __sanitizer_syscall_post_impl_setgroups(long res, long gidsetsize,
+                                             long grouplist);
+void __sanitizer_syscall_pre_impl_acct(long name);
+void __sanitizer_syscall_post_impl_acct(long res, long name);
+void __sanitizer_syscall_pre_impl_capget(long header, long dataptr);
+void __sanitizer_syscall_post_impl_capget(long res, long header, long dataptr);
+void __sanitizer_syscall_pre_impl_capset(long header, long data);
+void __sanitizer_syscall_post_impl_capset(long res, long header, long data);
+void __sanitizer_syscall_pre_impl_personality(long personality);
+void __sanitizer_syscall_post_impl_personality(long res, long personality);
+void __sanitizer_syscall_pre_impl_sigpending(long set);
+void __sanitizer_syscall_post_impl_sigpending(long res, long set);
+void __sanitizer_syscall_pre_impl_sigprocmask(long how, long set, long oset);
+void __sanitizer_syscall_post_impl_sigprocmask(long res, long how, long set,
+                                               long oset);
+void __sanitizer_syscall_pre_impl_getitimer(long which, long value);
+void __sanitizer_syscall_post_impl_getitimer(long res, long which, long value);
+void __sanitizer_syscall_pre_impl_setitimer(long which, long value,
+                                            long ovalue);
+void __sanitizer_syscall_post_impl_setitimer(long res, long which, long value,
+                                             long ovalue);
+void __sanitizer_syscall_pre_impl_timer_create(long which_clock,
+                                               long timer_event_spec,
+                                               long created_timer_id);
+void __sanitizer_syscall_post_impl_timer_create(long res, long which_clock,
+                                                long timer_event_spec,
+                                                long created_timer_id);
+void __sanitizer_syscall_pre_impl_timer_gettime(long timer_id, long setting);
+void __sanitizer_syscall_post_impl_timer_gettime(long res, long timer_id,
+                                                 long setting);
+void __sanitizer_syscall_pre_impl_timer_getoverrun(long timer_id);
+void __sanitizer_syscall_post_impl_timer_getoverrun(long res, long timer_id);
+void __sanitizer_syscall_pre_impl_timer_settime(long timer_id, long flags,
+                                                long new_setting,
+                                                long old_setting);
+void __sanitizer_syscall_post_impl_timer_settime(long res, long timer_id,
+                                                 long flags, long new_setting,
+                                                 long old_setting);
+void __sanitizer_syscall_pre_impl_timer_delete(long timer_id);
+void __sanitizer_syscall_post_impl_timer_delete(long res, long timer_id);
+void __sanitizer_syscall_pre_impl_clock_settime(long which_clock, long tp);
+void __sanitizer_syscall_post_impl_clock_settime(long res, long which_clock,
+                                                 long tp);
+void __sanitizer_syscall_pre_impl_clock_gettime(long which_clock, long tp);
+void __sanitizer_syscall_post_impl_clock_gettime(long res, long which_clock,
+                                                 long tp);
+void __sanitizer_syscall_pre_impl_clock_adjtime(long which_clock, long tx);
+void __sanitizer_syscall_post_impl_clock_adjtime(long res, long which_clock,
+                                                 long tx);
+void __sanitizer_syscall_pre_impl_clock_getres(long which_clock, long tp);
+void __sanitizer_syscall_post_impl_clock_getres(long res, long which_clock,
+                                                long tp);
+void __sanitizer_syscall_pre_impl_clock_nanosleep(long which_clock, long flags,
+                                                  long rqtp, long rmtp);
+void __sanitizer_syscall_post_impl_clock_nanosleep(long res, long which_clock,
+                                                   long flags, long rqtp,
+                                                   long rmtp);
+void __sanitizer_syscall_pre_impl_nice(long increment);
+void __sanitizer_syscall_post_impl_nice(long res, long increment);
+void __sanitizer_syscall_pre_impl_sched_setscheduler(long pid, long policy,
+                                                     long param);
+void __sanitizer_syscall_post_impl_sched_setscheduler(long res, long pid,
+                                                      long policy, long param);
+void __sanitizer_syscall_pre_impl_sched_setparam(long pid, long param);
+void __sanitizer_syscall_post_impl_sched_setparam(long res, long pid,
+                                                  long param);
+void __sanitizer_syscall_pre_impl_sched_getscheduler(long pid);
+void __sanitizer_syscall_post_impl_sched_getscheduler(long res, long pid);
+void __sanitizer_syscall_pre_impl_sched_getparam(long pid, long param);
+void __sanitizer_syscall_post_impl_sched_getparam(long res, long pid,
+                                                  long param);
+void __sanitizer_syscall_pre_impl_sched_setaffinity(long pid, long len,
+                                                    long user_mask_ptr);
+void __sanitizer_syscall_post_impl_sched_setaffinity(long res, long pid,
+                                                     long len,
+                                                     long user_mask_ptr);
+void __sanitizer_syscall_pre_impl_sched_getaffinity(long pid, long len,
+                                                    long user_mask_ptr);
+void __sanitizer_syscall_post_impl_sched_getaffinity(long res, long pid,
+                                                     long len,
+                                                     long user_mask_ptr);
+void __sanitizer_syscall_pre_impl_sched_yield();
+void __sanitizer_syscall_post_impl_sched_yield(long res);
+void __sanitizer_syscall_pre_impl_sched_get_priority_max(long policy);
+void __sanitizer_syscall_post_impl_sched_get_priority_max(long res,
+                                                          long policy);
+void __sanitizer_syscall_pre_impl_sched_get_priority_min(long policy);
+void __sanitizer_syscall_post_impl_sched_get_priority_min(long res,
+                                                          long policy);
+void __sanitizer_syscall_pre_impl_sched_rr_get_interval(long pid,
+                                                        long interval);
+void __sanitizer_syscall_post_impl_sched_rr_get_interval(long res, long pid,
+                                                         long interval);
+void __sanitizer_syscall_pre_impl_setpriority(long which, long who,
+                                              long niceval);
+void __sanitizer_syscall_post_impl_setpriority(long res, long which, long who,
+                                               long niceval);
+void __sanitizer_syscall_pre_impl_getpriority(long which, long who);
+void __sanitizer_syscall_post_impl_getpriority(long res, long which, long who);
+void __sanitizer_syscall_pre_impl_shutdown(long arg0, long arg1);
+void __sanitizer_syscall_post_impl_shutdown(long res, long arg0, long arg1);
+void __sanitizer_syscall_pre_impl_reboot(long magic1, long magic2, long cmd,
+                                         long arg);
+void __sanitizer_syscall_post_impl_reboot(long res, long magic1, long magic2,
+                                          long cmd, long arg);
+void __sanitizer_syscall_pre_impl_restart_syscall();
+void __sanitizer_syscall_post_impl_restart_syscall(long res);
+void __sanitizer_syscall_pre_impl_kexec_load(long entry, long nr_segments,
+                                             long segments, long flags);
+void __sanitizer_syscall_post_impl_kexec_load(long res, long entry,
+                                              long nr_segments, long segments,
+                                              long flags);
+void __sanitizer_syscall_pre_impl_exit(long error_code);
+void __sanitizer_syscall_post_impl_exit(long res, long error_code);
+void __sanitizer_syscall_pre_impl_exit_group(long error_code);
+void __sanitizer_syscall_post_impl_exit_group(long res, long error_code);
+void __sanitizer_syscall_pre_impl_wait4(long pid, long stat_addr, long options,
+                                        long ru);
+void __sanitizer_syscall_post_impl_wait4(long res, long pid, long stat_addr,
+                                         long options, long ru);
+void __sanitizer_syscall_pre_impl_waitid(long which, long pid, long infop,
+                                         long options, long ru);
+void __sanitizer_syscall_post_impl_waitid(long res, long which, long pid,
+                                          long infop, long options, long ru);
+void __sanitizer_syscall_pre_impl_waitpid(long pid, long stat_addr,
+                                          long options);
+void __sanitizer_syscall_post_impl_waitpid(long res, long pid, long stat_addr,
+                                           long options);
+void __sanitizer_syscall_pre_impl_set_tid_address(long tidptr);
+void __sanitizer_syscall_post_impl_set_tid_address(long res, long tidptr);
+void __sanitizer_syscall_pre_impl_init_module(long umod, long len, long uargs);
+void __sanitizer_syscall_post_impl_init_module(long res, long umod, long len,
+                                               long uargs);
+void __sanitizer_syscall_pre_impl_delete_module(long name_user, long flags);
+void __sanitizer_syscall_post_impl_delete_module(long res, long name_user,
+                                                 long flags);
+void __sanitizer_syscall_pre_impl_rt_sigprocmask(long how, long set, long oset,
+                                                 long sigsetsize);
+void __sanitizer_syscall_post_impl_rt_sigprocmask(long res, long how, long set,
+                                                  long oset, long sigsetsize);
+void __sanitizer_syscall_pre_impl_rt_sigpending(long set, long sigsetsize);
+void __sanitizer_syscall_post_impl_rt_sigpending(long res, long set,
+                                                 long sigsetsize);
+void __sanitizer_syscall_pre_impl_rt_sigtimedwait(long uthese, long uinfo,
+                                                  long uts, long sigsetsize);
+void __sanitizer_syscall_post_impl_rt_sigtimedwait(long res, long uthese,
+                                                   long uinfo, long uts,
+                                                   long sigsetsize);
+void __sanitizer_syscall_pre_impl_rt_tgsigqueueinfo(long tgid, long pid,
+                                                    long sig, long uinfo);
+void __sanitizer_syscall_post_impl_rt_tgsigqueueinfo(long res, long tgid,
+                                                     long pid, long sig,
+                                                     long uinfo);
+void __sanitizer_syscall_pre_impl_kill(long pid, long sig);
+void __sanitizer_syscall_post_impl_kill(long res, long pid, long sig);
+void __sanitizer_syscall_pre_impl_tgkill(long tgid, long pid, long sig);
+void __sanitizer_syscall_post_impl_tgkill(long res, long tgid, long pid,
+                                          long sig);
+void __sanitizer_syscall_pre_impl_tkill(long pid, long sig);
+void __sanitizer_syscall_post_impl_tkill(long res, long pid, long sig);
+void __sanitizer_syscall_pre_impl_rt_sigqueueinfo(long pid, long sig,
+                                                  long uinfo);
+void __sanitizer_syscall_post_impl_rt_sigqueueinfo(long res, long pid, long sig,
+                                                   long uinfo);
+void __sanitizer_syscall_pre_impl_sgetmask();
+void __sanitizer_syscall_post_impl_sgetmask(long res);
+void __sanitizer_syscall_pre_impl_ssetmask(long newmask);
+void __sanitizer_syscall_post_impl_ssetmask(long res, long newmask);
+void __sanitizer_syscall_pre_impl_signal(long sig, long handler);
+void __sanitizer_syscall_post_impl_signal(long res, long sig, long handler);
+void __sanitizer_syscall_pre_impl_pause();
+void __sanitizer_syscall_post_impl_pause(long res);
+void __sanitizer_syscall_pre_impl_sync();
+void __sanitizer_syscall_post_impl_sync(long res);
+void __sanitizer_syscall_pre_impl_fsync(long fd);
+void __sanitizer_syscall_post_impl_fsync(long res, long fd);
+void __sanitizer_syscall_pre_impl_fdatasync(long fd);
+void __sanitizer_syscall_post_impl_fdatasync(long res, long fd);
+void __sanitizer_syscall_pre_impl_bdflush(long func, long data);
+void __sanitizer_syscall_post_impl_bdflush(long res, long func, long data);
+void __sanitizer_syscall_pre_impl_mount(long dev_name, long dir_name, long type,
+                                        long flags, long data);
+void __sanitizer_syscall_post_impl_mount(long res, long dev_name, long dir_name,
+                                         long type, long flags, long data);
+void __sanitizer_syscall_pre_impl_umount(long name, long flags);
+void __sanitizer_syscall_post_impl_umount(long res, long name, long flags);
+void __sanitizer_syscall_pre_impl_oldumount(long name);
+void __sanitizer_syscall_post_impl_oldumount(long res, long name);
+void __sanitizer_syscall_pre_impl_truncate(long path, long length);
+void __sanitizer_syscall_post_impl_truncate(long res, long path, long length);
+void __sanitizer_syscall_pre_impl_ftruncate(long fd, long length);
+void __sanitizer_syscall_post_impl_ftruncate(long res, long fd, long length);
+void __sanitizer_syscall_pre_impl_stat(long filename, long statbuf);
+void __sanitizer_syscall_post_impl_stat(long res, long filename, long statbuf);
+void __sanitizer_syscall_pre_impl_statfs(long path, long buf);
+void __sanitizer_syscall_post_impl_statfs(long res, long path, long buf);
+void __sanitizer_syscall_pre_impl_statfs64(long path, long sz, long buf);
+void __sanitizer_syscall_post_impl_statfs64(long res, long path, long sz,
+                                            long buf);
+void __sanitizer_syscall_pre_impl_fstatfs(long fd, long buf);
+void __sanitizer_syscall_post_impl_fstatfs(long res, long fd, long buf);
+void __sanitizer_syscall_pre_impl_fstatfs64(long fd, long sz, long buf);
+void __sanitizer_syscall_post_impl_fstatfs64(long res, long fd, long sz,
+                                             long buf);
+void __sanitizer_syscall_pre_impl_lstat(long filename, long statbuf);
+void __sanitizer_syscall_post_impl_lstat(long res, long filename, long statbuf);
+void __sanitizer_syscall_pre_impl_fstat(long fd, long statbuf);
+void __sanitizer_syscall_post_impl_fstat(long res, long fd, long statbuf);
+void __sanitizer_syscall_pre_impl_newstat(long filename, long statbuf);
+void __sanitizer_syscall_post_impl_newstat(long res, long filename,
+                                           long statbuf);
+void __sanitizer_syscall_pre_impl_newlstat(long filename, long statbuf);
+void __sanitizer_syscall_post_impl_newlstat(long res, long filename,
+                                            long statbuf);
+void __sanitizer_syscall_pre_impl_newfstat(long fd, long statbuf);
+void __sanitizer_syscall_post_impl_newfstat(long res, long fd, long statbuf);
+void __sanitizer_syscall_pre_impl_ustat(long dev, long ubuf);
+void __sanitizer_syscall_post_impl_ustat(long res, long dev, long ubuf);
+void __sanitizer_syscall_pre_impl_stat64(long filename, long statbuf);
+void __sanitizer_syscall_post_impl_stat64(long res, long filename,
+                                          long statbuf);
+void __sanitizer_syscall_pre_impl_fstat64(long fd, long statbuf);
+void __sanitizer_syscall_post_impl_fstat64(long res, long fd, long statbuf);
+void __sanitizer_syscall_pre_impl_lstat64(long filename, long statbuf);
+void __sanitizer_syscall_post_impl_lstat64(long res, long filename,
+                                           long statbuf);
+void __sanitizer_syscall_pre_impl_setxattr(long path, long name, long value,
+                                           long size, long flags);
+void __sanitizer_syscall_post_impl_setxattr(long res, long path, long name,
+                                            long value, long size, long flags);
+void __sanitizer_syscall_pre_impl_lsetxattr(long path, long name, long value,
+                                            long size, long flags);
+void __sanitizer_syscall_post_impl_lsetxattr(long res, long path, long name,
+                                             long value, long size, long flags);
+void __sanitizer_syscall_pre_impl_fsetxattr(long fd, long name, long value,
+                                            long size, long flags);
+void __sanitizer_syscall_post_impl_fsetxattr(long res, long fd, long name,
+                                             long value, long size, long flags);
+void __sanitizer_syscall_pre_impl_getxattr(long path, long name, long value,
+                                           long size);
+void __sanitizer_syscall_post_impl_getxattr(long res, long path, long name,
+                                            long value, long size);
+void __sanitizer_syscall_pre_impl_lgetxattr(long path, long name, long value,
+                                            long size);
+void __sanitizer_syscall_post_impl_lgetxattr(long res, long path, long name,
+                                             long value, long size);
+void __sanitizer_syscall_pre_impl_fgetxattr(long fd, long name, long value,
+                                            long size);
+void __sanitizer_syscall_post_impl_fgetxattr(long res, long fd, long name,
+                                             long value, long size);
+void __sanitizer_syscall_pre_impl_listxattr(long path, long list, long size);
+void __sanitizer_syscall_post_impl_listxattr(long res, long path, long list,
+                                             long size);
+void __sanitizer_syscall_pre_impl_llistxattr(long path, long list, long size);
+void __sanitizer_syscall_post_impl_llistxattr(long res, long path, long list,
+                                              long size);
+void __sanitizer_syscall_pre_impl_flistxattr(long fd, long list, long size);
+void __sanitizer_syscall_post_impl_flistxattr(long res, long fd, long list,
+                                              long size);
+void __sanitizer_syscall_pre_impl_removexattr(long path, long name);
+void __sanitizer_syscall_post_impl_removexattr(long res, long path, long name);
+void __sanitizer_syscall_pre_impl_lremovexattr(long path, long name);
+void __sanitizer_syscall_post_impl_lremovexattr(long res, long path, long name);
+void __sanitizer_syscall_pre_impl_fremovexattr(long fd, long name);
+void __sanitizer_syscall_post_impl_fremovexattr(long res, long fd, long name);
+void __sanitizer_syscall_pre_impl_brk(long brk);
+void __sanitizer_syscall_post_impl_brk(long res, long brk);
+void __sanitizer_syscall_pre_impl_mprotect(long start, long len, long prot);
+void __sanitizer_syscall_post_impl_mprotect(long res, long start, long len,
+                                            long prot);
+void __sanitizer_syscall_pre_impl_mremap(long addr, long old_len, long new_len,
+                                         long flags, long new_addr);
+void __sanitizer_syscall_post_impl_mremap(long res, long addr, long old_len,
+                                          long new_len, long flags,
+                                          long new_addr);
+void __sanitizer_syscall_pre_impl_remap_file_pages(long start, long size,
+                                                   long prot, long pgoff,
+                                                   long flags);
+void __sanitizer_syscall_post_impl_remap_file_pages(long res, long start,
+                                                    long size, long prot,
+                                                    long pgoff, long flags);
+void __sanitizer_syscall_pre_impl_msync(long start, long len, long flags);
+void __sanitizer_syscall_post_impl_msync(long res, long start, long len,
+                                         long flags);
+void __sanitizer_syscall_pre_impl_munmap(long addr, long len);
+void __sanitizer_syscall_post_impl_munmap(long res, long addr, long len);
+void __sanitizer_syscall_pre_impl_mlock(long start, long len);
+void __sanitizer_syscall_post_impl_mlock(long res, long start, long len);
+void __sanitizer_syscall_pre_impl_munlock(long start, long len);
+void __sanitizer_syscall_post_impl_munlock(long res, long start, long len);
+void __sanitizer_syscall_pre_impl_mlockall(long flags);
+void __sanitizer_syscall_post_impl_mlockall(long res, long flags);
+void __sanitizer_syscall_pre_impl_munlockall();
+void __sanitizer_syscall_post_impl_munlockall(long res);
+void __sanitizer_syscall_pre_impl_madvise(long start, long len, long behavior);
+void __sanitizer_syscall_post_impl_madvise(long res, long start, long len,
+                                           long behavior);
+void __sanitizer_syscall_pre_impl_mincore(long start, long len, long vec);
+void __sanitizer_syscall_post_impl_mincore(long res, long start, long len,
+                                           long vec);
+void __sanitizer_syscall_pre_impl_pivot_root(long new_root, long put_old);
+void __sanitizer_syscall_post_impl_pivot_root(long res, long new_root,
+                                              long put_old);
+void __sanitizer_syscall_pre_impl_chroot(long filename);
+void __sanitizer_syscall_post_impl_chroot(long res, long filename);
+void __sanitizer_syscall_pre_impl_mknod(long filename, long mode, long dev);
+void __sanitizer_syscall_post_impl_mknod(long res, long filename, long mode,
+                                         long dev);
+void __sanitizer_syscall_pre_impl_link(long oldname, long newname);
+void __sanitizer_syscall_post_impl_link(long res, long oldname, long newname);
+void __sanitizer_syscall_pre_impl_symlink(long old, long new_);
+void __sanitizer_syscall_post_impl_symlink(long res, long old, long new_);
+void __sanitizer_syscall_pre_impl_unlink(long pathname);
+void __sanitizer_syscall_post_impl_unlink(long res, long pathname);
+void __sanitizer_syscall_pre_impl_rename(long oldname, long newname);
+void __sanitizer_syscall_post_impl_rename(long res, long oldname, long newname);
+void __sanitizer_syscall_pre_impl_chmod(long filename, long mode);
+void __sanitizer_syscall_post_impl_chmod(long res, long filename, long mode);
+void __sanitizer_syscall_pre_impl_fchmod(long fd, long mode);
+void __sanitizer_syscall_post_impl_fchmod(long res, long fd, long mode);
+void __sanitizer_syscall_pre_impl_fcntl(long fd, long cmd, long arg);
+void __sanitizer_syscall_post_impl_fcntl(long res, long fd, long cmd, long arg);
+void __sanitizer_syscall_pre_impl_fcntl64(long fd, long cmd, long arg);
+void __sanitizer_syscall_post_impl_fcntl64(long res, long fd, long cmd,
+                                           long arg);
+void __sanitizer_syscall_pre_impl_pipe(long fildes);
+void __sanitizer_syscall_post_impl_pipe(long res, long fildes);
+void __sanitizer_syscall_pre_impl_pipe2(long fildes, long flags);
+void __sanitizer_syscall_post_impl_pipe2(long res, long fildes, long flags);
+void __sanitizer_syscall_pre_impl_dup(long fildes);
+void __sanitizer_syscall_post_impl_dup(long res, long fildes);
+void __sanitizer_syscall_pre_impl_dup2(long oldfd, long newfd);
+void __sanitizer_syscall_post_impl_dup2(long res, long oldfd, long newfd);
+void __sanitizer_syscall_pre_impl_dup3(long oldfd, long newfd, long flags);
+void __sanitizer_syscall_post_impl_dup3(long res, long oldfd, long newfd,
+                                        long flags);
+void __sanitizer_syscall_pre_impl_ioperm(long from, long num, long on);
+void __sanitizer_syscall_post_impl_ioperm(long res, long from, long num,
+                                          long on);
+void __sanitizer_syscall_pre_impl_ioctl(long fd, long cmd, long arg);
+void __sanitizer_syscall_post_impl_ioctl(long res, long fd, long cmd, long arg);
+void __sanitizer_syscall_pre_impl_flock(long fd, long cmd);
+void __sanitizer_syscall_post_impl_flock(long res, long fd, long cmd);
+void __sanitizer_syscall_pre_impl_io_setup(long nr_reqs, long ctx);
+void __sanitizer_syscall_post_impl_io_setup(long res, long nr_reqs, long ctx);
+void __sanitizer_syscall_pre_impl_io_destroy(long ctx);
+void __sanitizer_syscall_post_impl_io_destroy(long res, long ctx);
+void __sanitizer_syscall_pre_impl_io_getevents(long ctx_id, long min_nr,
+                                               long nr, long events,
+                                               long timeout);
+void __sanitizer_syscall_post_impl_io_getevents(long res, long ctx_id,
+                                                long min_nr, long nr,
+                                                long events, long timeout);
+void __sanitizer_syscall_pre_impl_io_submit(long ctx_id, long arg1, long arg2);
+void __sanitizer_syscall_post_impl_io_submit(long res, long ctx_id, long arg1,
+                                             long arg2);
+void __sanitizer_syscall_pre_impl_io_cancel(long ctx_id, long iocb,
+                                            long result);
+void __sanitizer_syscall_post_impl_io_cancel(long res, long ctx_id, long iocb,
+                                             long result);
+void __sanitizer_syscall_pre_impl_sendfile(long out_fd, long in_fd, long offset,
+                                           long count);
+void __sanitizer_syscall_post_impl_sendfile(long res, long out_fd, long in_fd,
+                                            long offset, long count);
+void __sanitizer_syscall_pre_impl_sendfile64(long out_fd, long in_fd,
+                                             long offset, long count);
+void __sanitizer_syscall_post_impl_sendfile64(long res, long out_fd, long in_fd,
+                                              long offset, long count);
+void __sanitizer_syscall_pre_impl_readlink(long path, long buf, long bufsiz);
+void __sanitizer_syscall_post_impl_readlink(long res, long path, long buf,
+                                            long bufsiz);
+void __sanitizer_syscall_pre_impl_creat(long pathname, long mode);
+void __sanitizer_syscall_post_impl_creat(long res, long pathname, long mode);
+void __sanitizer_syscall_pre_impl_open(long filename, long flags, long mode);
+void __sanitizer_syscall_post_impl_open(long res, long filename, long flags,
+                                        long mode);
+void __sanitizer_syscall_pre_impl_close(long fd);
+void __sanitizer_syscall_post_impl_close(long res, long fd);
+void __sanitizer_syscall_pre_impl_access(long filename, long mode);
+void __sanitizer_syscall_post_impl_access(long res, long filename, long mode);
+void __sanitizer_syscall_pre_impl_vhangup();
+void __sanitizer_syscall_post_impl_vhangup(long res);
+void __sanitizer_syscall_pre_impl_chown(long filename, long user, long group);
+void __sanitizer_syscall_post_impl_chown(long res, long filename, long user,
+                                         long group);
+void __sanitizer_syscall_pre_impl_lchown(long filename, long user, long group);
+void __sanitizer_syscall_post_impl_lchown(long res, long filename, long user,
+                                          long group);
+void __sanitizer_syscall_pre_impl_fchown(long fd, long user, long group);
+void __sanitizer_syscall_post_impl_fchown(long res, long fd, long user,
+                                          long group);
+void __sanitizer_syscall_pre_impl_chown16(long filename, long user, long group);
+void __sanitizer_syscall_post_impl_chown16(long res, long filename, long user,
+                                           long group);
+void __sanitizer_syscall_pre_impl_lchown16(long filename, long user,
+                                           long group);
+void __sanitizer_syscall_post_impl_lchown16(long res, long filename, long user,
+                                            long group);
+void __sanitizer_syscall_pre_impl_fchown16(long fd, long user, long group);
+void __sanitizer_syscall_post_impl_fchown16(long res, long fd, long user,
+                                            long group);
+void __sanitizer_syscall_pre_impl_setregid16(long rgid, long egid);
+void __sanitizer_syscall_post_impl_setregid16(long res, long rgid, long egid);
+void __sanitizer_syscall_pre_impl_setgid16(long gid);
+void __sanitizer_syscall_post_impl_setgid16(long res, long gid);
+void __sanitizer_syscall_pre_impl_setreuid16(long ruid, long euid);
+void __sanitizer_syscall_post_impl_setreuid16(long res, long ruid, long euid);
+void __sanitizer_syscall_pre_impl_setuid16(long uid);
+void __sanitizer_syscall_post_impl_setuid16(long res, long uid);
+void __sanitizer_syscall_pre_impl_setresuid16(long ruid, long euid, long suid);
+void __sanitizer_syscall_post_impl_setresuid16(long res, long ruid, long euid,
+                                               long suid);
+void __sanitizer_syscall_pre_impl_getresuid16(long ruid, long euid, long suid);
+void __sanitizer_syscall_post_impl_getresuid16(long res, long ruid, long euid,
+                                               long suid);
+void __sanitizer_syscall_pre_impl_setresgid16(long rgid, long egid, long sgid);
+void __sanitizer_syscall_post_impl_setresgid16(long res, long rgid, long egid,
+                                               long sgid);
+void __sanitizer_syscall_pre_impl_getresgid16(long rgid, long egid, long sgid);
+void __sanitizer_syscall_post_impl_getresgid16(long res, long rgid, long egid,
+                                               long sgid);
+void __sanitizer_syscall_pre_impl_setfsuid16(long uid);
+void __sanitizer_syscall_post_impl_setfsuid16(long res, long uid);
+void __sanitizer_syscall_pre_impl_setfsgid16(long gid);
+void __sanitizer_syscall_post_impl_setfsgid16(long res, long gid);
+void __sanitizer_syscall_pre_impl_getgroups16(long gidsetsize, long grouplist);
+void __sanitizer_syscall_post_impl_getgroups16(long res, long gidsetsize,
+                                               long grouplist);
+void __sanitizer_syscall_pre_impl_setgroups16(long gidsetsize, long grouplist);
+void __sanitizer_syscall_post_impl_setgroups16(long res, long gidsetsize,
+                                               long grouplist);
+void __sanitizer_syscall_pre_impl_getuid16();
+void __sanitizer_syscall_post_impl_getuid16(long res);
+void __sanitizer_syscall_pre_impl_geteuid16();
+void __sanitizer_syscall_post_impl_geteuid16(long res);
+void __sanitizer_syscall_pre_impl_getgid16();
+void __sanitizer_syscall_post_impl_getgid16(long res);
+void __sanitizer_syscall_pre_impl_getegid16();
+void __sanitizer_syscall_post_impl_getegid16(long res);
+void __sanitizer_syscall_pre_impl_utime(long filename, long times);
+void __sanitizer_syscall_post_impl_utime(long res, long filename, long times);
+void __sanitizer_syscall_pre_impl_utimes(long filename, long utimes);
+void __sanitizer_syscall_post_impl_utimes(long res, long filename, long utimes);
+void __sanitizer_syscall_pre_impl_lseek(long fd, long offset, long origin);
+void __sanitizer_syscall_post_impl_lseek(long res, long fd, long offset,
+                                         long origin);
+void __sanitizer_syscall_pre_impl_llseek(long fd, long offset_high,
+                                         long offset_low, long result,
+                                         long origin);
+void __sanitizer_syscall_post_impl_llseek(long res, long fd, long offset_high,
+                                          long offset_low, long result,
+                                          long origin);
+void __sanitizer_syscall_pre_impl_read(long fd, long buf, long count);
+void __sanitizer_syscall_post_impl_read(long res, long fd, long buf,
+                                        long count);
+void __sanitizer_syscall_pre_impl_readv(long fd, long vec, long vlen);
+void __sanitizer_syscall_post_impl_readv(long res, long fd, long vec,
+                                         long vlen);
+void __sanitizer_syscall_pre_impl_write(long fd, long buf, long count);
+void __sanitizer_syscall_post_impl_write(long res, long fd, long buf,
+                                         long count);
+void __sanitizer_syscall_pre_impl_writev(long fd, long vec, long vlen);
+void __sanitizer_syscall_post_impl_writev(long res, long fd, long vec,
+                                          long vlen);
+
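+// Note: on 32-bit targets the kernel receives a 64-bit file offset as two
+// register-sized halves, hence the split pos0/pos1 prototypes below.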
+#ifdef _LP64
+void __sanitizer_syscall_pre_impl_pread64(long fd, long buf, long count,
+                                          long pos);
+void __sanitizer_syscall_post_impl_pread64(long res, long fd, long buf,
+                                           long count, long pos);
+void __sanitizer_syscall_pre_impl_pwrite64(long fd, long buf, long count,
+                                           long pos);
+void __sanitizer_syscall_post_impl_pwrite64(long res, long fd, long buf,
+                                            long count, long pos);
+#else
+void __sanitizer_syscall_pre_impl_pread64(long fd, long buf, long count,
+                                          long pos0, long pos1);
+void __sanitizer_syscall_post_impl_pread64(long res, long fd, long buf,
+                                           long count, long pos0, long pos1);
+void __sanitizer_syscall_pre_impl_pwrite64(long fd, long buf, long count,
+                                           long pos0, long pos1);
+void __sanitizer_syscall_post_impl_pwrite64(long res, long fd, long buf,
+                                            long count, long pos0, long pos1);
+#endif
+
+void __sanitizer_syscall_pre_impl_preadv(long fd, long vec, long vlen,
+                                         long pos_l, long pos_h);
+void __sanitizer_syscall_post_impl_preadv(long res, long fd, long vec,
+                                          long vlen, long pos_l, long pos_h);
+void __sanitizer_syscall_pre_impl_pwritev(long fd, long vec, long vlen,
+                                          long pos_l, long pos_h);
+void __sanitizer_syscall_post_impl_pwritev(long res, long fd, long vec,
+                                           long vlen, long pos_l, long pos_h);
+void __sanitizer_syscall_pre_impl_getcwd(long buf, long size);
+void __sanitizer_syscall_post_impl_getcwd(long res, long buf, long size);
+void __sanitizer_syscall_pre_impl_mkdir(long pathname, long mode);
+void __sanitizer_syscall_post_impl_mkdir(long res, long pathname, long mode);
+void __sanitizer_syscall_pre_impl_chdir(long filename);
+void __sanitizer_syscall_post_impl_chdir(long res, long filename);
+void __sanitizer_syscall_pre_impl_fchdir(long fd);
+void __sanitizer_syscall_post_impl_fchdir(long res, long fd);
+void __sanitizer_syscall_pre_impl_rmdir(long pathname);
+void __sanitizer_syscall_post_impl_rmdir(long res, long pathname);
+void __sanitizer_syscall_pre_impl_lookup_dcookie(long cookie64, long buf,
+                                                 long len);
+void __sanitizer_syscall_post_impl_lookup_dcookie(long res, long cookie64,
+                                                  long buf, long len);
+void __sanitizer_syscall_pre_impl_quotactl(long cmd, long special, long id,
+                                           long addr);
+void __sanitizer_syscall_post_impl_quotactl(long res, long cmd, long special,
+                                            long id, long addr);
+void __sanitizer_syscall_pre_impl_getdents(long fd, long dirent, long count);
+void __sanitizer_syscall_post_impl_getdents(long res, long fd, long dirent,
+                                            long count);
+void __sanitizer_syscall_pre_impl_getdents64(long fd, long dirent, long count);
+void __sanitizer_syscall_post_impl_getdents64(long res, long fd, long dirent,
+                                              long count);
+void __sanitizer_syscall_pre_impl_setsockopt(long fd, long level, long optname,
+                                             long optval, long optlen);
+void __sanitizer_syscall_post_impl_setsockopt(long res, long fd, long level,
+                                              long optname, long optval,
+                                              long optlen);
+void __sanitizer_syscall_pre_impl_getsockopt(long fd, long level, long optname,
+                                             long optval, long optlen);
+void __sanitizer_syscall_post_impl_getsockopt(long res, long fd, long level,
+                                              long optname, long optval,
+                                              long optlen);
+void __sanitizer_syscall_pre_impl_bind(long arg0, long arg1, long arg2);
+void __sanitizer_syscall_post_impl_bind(long res, long arg0, long arg1,
+                                        long arg2);
+void __sanitizer_syscall_pre_impl_connect(long arg0, long arg1, long arg2);
+void __sanitizer_syscall_post_impl_connect(long res, long arg0, long arg1,
+                                           long arg2);
+void __sanitizer_syscall_pre_impl_accept(long arg0, long arg1, long arg2);
+void __sanitizer_syscall_post_impl_accept(long res, long arg0, long arg1,
+                                          long arg2);
+void __sanitizer_syscall_pre_impl_accept4(long arg0, long arg1, long arg2,
+                                          long arg3);
+void __sanitizer_syscall_post_impl_accept4(long res, long arg0, long arg1,
+                                           long arg2, long arg3);
+void __sanitizer_syscall_pre_impl_getsockname(long arg0, long arg1, long arg2);
+void __sanitizer_syscall_post_impl_getsockname(long res, long arg0, long arg1,
+                                               long arg2);
+void __sanitizer_syscall_pre_impl_getpeername(long arg0, long arg1, long arg2);
+void __sanitizer_syscall_post_impl_getpeername(long res, long arg0, long arg1,
+                                               long arg2);
+void __sanitizer_syscall_pre_impl_send(long arg0, long arg1, long arg2,
+                                       long arg3);
+void __sanitizer_syscall_post_impl_send(long res, long arg0, long arg1,
+                                        long arg2, long arg3);
+void __sanitizer_syscall_pre_impl_sendto(long arg0, long arg1, long arg2,
+                                         long arg3, long arg4, long arg5);
+void __sanitizer_syscall_post_impl_sendto(long res, long arg0, long arg1,
+                                          long arg2, long arg3, long arg4,
+                                          long arg5);
+void __sanitizer_syscall_pre_impl_sendmsg(long fd, long msg, long flags);
+void __sanitizer_syscall_post_impl_sendmsg(long res, long fd, long msg,
+                                           long flags);
+void __sanitizer_syscall_pre_impl_sendmmsg(long fd, long msg, long vlen,
+                                           long flags);
+void __sanitizer_syscall_post_impl_sendmmsg(long res, long fd, long msg,
+                                            long vlen, long flags);
+void __sanitizer_syscall_pre_impl_recv(long arg0, long arg1, long arg2,
+                                       long arg3);
+void __sanitizer_syscall_post_impl_recv(long res, long arg0, long arg1,
+                                        long arg2, long arg3);
+void __sanitizer_syscall_pre_impl_recvfrom(long arg0, long arg1, long arg2,
+                                           long arg3, long arg4, long arg5);
+void __sanitizer_syscall_post_impl_recvfrom(long res, long arg0, long arg1,
+                                            long arg2, long arg3, long arg4,
+                                            long arg5);
+void __sanitizer_syscall_pre_impl_recvmsg(long fd, long msg, long flags);
+void __sanitizer_syscall_post_impl_recvmsg(long res, long fd, long msg,
+                                           long flags);
+void __sanitizer_syscall_pre_impl_recvmmsg(long fd, long msg, long vlen,
+                                           long flags, long timeout);
+void __sanitizer_syscall_post_impl_recvmmsg(long res, long fd, long msg,
+                                            long vlen, long flags,
+                                            long timeout);
+void __sanitizer_syscall_pre_impl_socket(long arg0, long arg1, long arg2);
+void __sanitizer_syscall_post_impl_socket(long res, long arg0, long arg1,
+                                          long arg2);
+void __sanitizer_syscall_pre_impl_socketpair(long arg0, long arg1, long arg2,
+                                             long arg3);
+void __sanitizer_syscall_post_impl_socketpair(long res, long arg0, long arg1,
+                                              long arg2, long arg3);
+void __sanitizer_syscall_pre_impl_socketcall(long call, long args);
+void __sanitizer_syscall_post_impl_socketcall(long res, long call, long args);
+void __sanitizer_syscall_pre_impl_listen(long arg0, long arg1);
+void __sanitizer_syscall_post_impl_listen(long res, long arg0, long arg1);
+void __sanitizer_syscall_pre_impl_poll(long ufds, long nfds, long timeout);
+void __sanitizer_syscall_post_impl_poll(long res, long ufds, long nfds,
+                                        long timeout);
+void __sanitizer_syscall_pre_impl_select(long n, long inp, long outp, long exp,
+                                         long tvp);
+void __sanitizer_syscall_post_impl_select(long res, long n, long inp, long outp,
+                                          long exp, long tvp);
+void __sanitizer_syscall_pre_impl_old_select(long arg);
+void __sanitizer_syscall_post_impl_old_select(long res, long arg);
+void __sanitizer_syscall_pre_impl_epoll_create(long size);
+void __sanitizer_syscall_post_impl_epoll_create(long res, long size);
+void __sanitizer_syscall_pre_impl_epoll_create1(long flags);
+void __sanitizer_syscall_post_impl_epoll_create1(long res, long flags);
+void __sanitizer_syscall_pre_impl_epoll_ctl(long epfd, long op, long fd,
+                                            long event);
+void __sanitizer_syscall_post_impl_epoll_ctl(long res, long epfd, long op,
+                                             long fd, long event);
+void __sanitizer_syscall_pre_impl_epoll_wait(long epfd, long events,
+                                             long maxevents, long timeout);
+void __sanitizer_syscall_post_impl_epoll_wait(long res, long epfd, long events,
+                                              long maxevents, long timeout);
+void __sanitizer_syscall_pre_impl_epoll_pwait(long epfd, long events,
+                                              long maxevents, long timeout,
+                                              long sigmask, long sigsetsize);
+void __sanitizer_syscall_post_impl_epoll_pwait(long res, long epfd, long events,
+                                               long maxevents, long timeout,
+                                               long sigmask, long sigsetsize);
+void __sanitizer_syscall_pre_impl_gethostname(long name, long len);
+void __sanitizer_syscall_post_impl_gethostname(long res, long name, long len);
+void __sanitizer_syscall_pre_impl_sethostname(long name, long len);
+void __sanitizer_syscall_post_impl_sethostname(long res, long name, long len);
+void __sanitizer_syscall_pre_impl_setdomainname(long name, long len);
+void __sanitizer_syscall_post_impl_setdomainname(long res, long name, long len);
+void __sanitizer_syscall_pre_impl_newuname(long name);
+void __sanitizer_syscall_post_impl_newuname(long res, long name);
+void __sanitizer_syscall_pre_impl_uname(long arg0);
+void __sanitizer_syscall_post_impl_uname(long res, long arg0);
+void __sanitizer_syscall_pre_impl_olduname(long arg0);
+void __sanitizer_syscall_post_impl_olduname(long res, long arg0);
+void __sanitizer_syscall_pre_impl_getrlimit(long resource, long rlim);
+void __sanitizer_syscall_post_impl_getrlimit(long res, long resource,
+                                             long rlim);
+void __sanitizer_syscall_pre_impl_old_getrlimit(long resource, long rlim);
+void __sanitizer_syscall_post_impl_old_getrlimit(long res, long resource,
+                                                 long rlim);
+void __sanitizer_syscall_pre_impl_setrlimit(long resource, long rlim);
+void __sanitizer_syscall_post_impl_setrlimit(long res, long resource,
+                                             long rlim);
+void __sanitizer_syscall_pre_impl_prlimit64(long pid, long resource,
+                                            long new_rlim, long old_rlim);
+void __sanitizer_syscall_post_impl_prlimit64(long res, long pid, long resource,
+                                             long new_rlim, long old_rlim);
+void __sanitizer_syscall_pre_impl_getrusage(long who, long ru);
+void __sanitizer_syscall_post_impl_getrusage(long res, long who, long ru);
+void __sanitizer_syscall_pre_impl_umask(long mask);
+void __sanitizer_syscall_post_impl_umask(long res, long mask);
+void __sanitizer_syscall_pre_impl_msgget(long key, long msgflg);
+void __sanitizer_syscall_post_impl_msgget(long res, long key, long msgflg);
+void __sanitizer_syscall_pre_impl_msgsnd(long msqid, long msgp, long msgsz,
+                                         long msgflg);
+void __sanitizer_syscall_post_impl_msgsnd(long res, long msqid, long msgp,
+                                          long msgsz, long msgflg);
+void __sanitizer_syscall_pre_impl_msgrcv(long msqid, long msgp, long msgsz,
+                                         long msgtyp, long msgflg);
+void __sanitizer_syscall_post_impl_msgrcv(long res, long msqid, long msgp,
+                                          long msgsz, long msgtyp, long msgflg);
+void __sanitizer_syscall_pre_impl_msgctl(long msqid, long cmd, long buf);
+void __sanitizer_syscall_post_impl_msgctl(long res, long msqid, long cmd,
+                                          long buf);
+void __sanitizer_syscall_pre_impl_semget(long key, long nsems, long semflg);
+void __sanitizer_syscall_post_impl_semget(long res, long key, long nsems,
+                                          long semflg);
+void __sanitizer_syscall_pre_impl_semop(long semid, long sops, long nsops);
+void __sanitizer_syscall_post_impl_semop(long res, long semid, long sops,
+                                         long nsops);
+void __sanitizer_syscall_pre_impl_semctl(long semid, long semnum, long cmd,
+                                         long arg);
+void __sanitizer_syscall_post_impl_semctl(long res, long semid, long semnum,
+                                          long cmd, long arg);
+void __sanitizer_syscall_pre_impl_semtimedop(long semid, long sops, long nsops,
+                                             long timeout);
+void __sanitizer_syscall_post_impl_semtimedop(long res, long semid, long sops,
+                                              long nsops, long timeout);
+void __sanitizer_syscall_pre_impl_shmat(long shmid, long shmaddr, long shmflg);
+void __sanitizer_syscall_post_impl_shmat(long res, long shmid, long shmaddr,
+                                         long shmflg);
+void __sanitizer_syscall_pre_impl_shmget(long key, long size, long flag);
+void __sanitizer_syscall_post_impl_shmget(long res, long key, long size,
+                                          long flag);
+void __sanitizer_syscall_pre_impl_shmdt(long shmaddr);
+void __sanitizer_syscall_post_impl_shmdt(long res, long shmaddr);
+void __sanitizer_syscall_pre_impl_shmctl(long shmid, long cmd, long buf);
+void __sanitizer_syscall_post_impl_shmctl(long res, long shmid, long cmd,
+                                          long buf);
+void __sanitizer_syscall_pre_impl_ipc(long call, long first, long second,
+                                      long third, long ptr, long fifth);
+void __sanitizer_syscall_post_impl_ipc(long res, long call, long first,
+                                       long second, long third, long ptr,
+                                       long fifth);
+void __sanitizer_syscall_pre_impl_mq_open(long name, long oflag, long mode,
+                                          long attr);
+void __sanitizer_syscall_post_impl_mq_open(long res, long name, long oflag,
+                                           long mode, long attr);
+void __sanitizer_syscall_pre_impl_mq_unlink(long name);
+void __sanitizer_syscall_post_impl_mq_unlink(long res, long name);
+void __sanitizer_syscall_pre_impl_mq_timedsend(long mqdes, long msg_ptr,
+                                               long msg_len, long msg_prio,
+                                               long abs_timeout);
+void __sanitizer_syscall_post_impl_mq_timedsend(long res, long mqdes,
+                                                long msg_ptr, long msg_len,
+                                                long msg_prio,
+                                                long abs_timeout);
+void __sanitizer_syscall_pre_impl_mq_timedreceive(long mqdes, long msg_ptr,
+                                                  long msg_len, long msg_prio,
+                                                  long abs_timeout);
+void __sanitizer_syscall_post_impl_mq_timedreceive(long res, long mqdes,
+                                                   long msg_ptr, long msg_len,
+                                                   long msg_prio,
+                                                   long abs_timeout);
+void __sanitizer_syscall_pre_impl_mq_notify(long mqdes, long notification);
+void __sanitizer_syscall_post_impl_mq_notify(long res, long mqdes,
+                                             long notification);
+void __sanitizer_syscall_pre_impl_mq_getsetattr(long mqdes, long mqstat,
+                                                long omqstat);
+void __sanitizer_syscall_post_impl_mq_getsetattr(long res, long mqdes,
+                                                 long mqstat, long omqstat);
+void __sanitizer_syscall_pre_impl_pciconfig_iobase(long which, long bus,
+                                                   long devfn);
+void __sanitizer_syscall_post_impl_pciconfig_iobase(long res, long which,
+                                                    long bus, long devfn);
+void __sanitizer_syscall_pre_impl_pciconfig_read(long bus, long dfn, long off,
+                                                 long len, long buf);
+void __sanitizer_syscall_post_impl_pciconfig_read(long res, long bus, long dfn,
+                                                  long off, long len, long buf);
+void __sanitizer_syscall_pre_impl_pciconfig_write(long bus, long dfn, long off,
+                                                  long len, long buf);
+void __sanitizer_syscall_post_impl_pciconfig_write(long res, long bus, long dfn,
+                                                   long off, long len,
+                                                   long buf);
+void __sanitizer_syscall_pre_impl_swapon(long specialfile, long swap_flags);
+void __sanitizer_syscall_post_impl_swapon(long res, long specialfile,
+                                          long swap_flags);
+void __sanitizer_syscall_pre_impl_swapoff(long specialfile);
+void __sanitizer_syscall_post_impl_swapoff(long res, long specialfile);
+void __sanitizer_syscall_pre_impl_sysctl(long args);
+void __sanitizer_syscall_post_impl_sysctl(long res, long args);
+void __sanitizer_syscall_pre_impl_sysinfo(long info);
+void __sanitizer_syscall_post_impl_sysinfo(long res, long info);
+void __sanitizer_syscall_pre_impl_sysfs(long option, long arg1, long arg2);
+void __sanitizer_syscall_post_impl_sysfs(long res, long option, long arg1,
+                                         long arg2);
+void __sanitizer_syscall_pre_impl_syslog(long type, long buf, long len);
+void __sanitizer_syscall_post_impl_syslog(long res, long type, long buf,
+                                          long len);
+void __sanitizer_syscall_pre_impl_uselib(long library);
+void __sanitizer_syscall_post_impl_uselib(long res, long library);
+void __sanitizer_syscall_pre_impl_ni_syscall();
+void __sanitizer_syscall_post_impl_ni_syscall(long res);
+void __sanitizer_syscall_pre_impl_ptrace(long request, long pid, long addr,
+                                         long data);
+void __sanitizer_syscall_post_impl_ptrace(long res, long request, long pid,
+                                          long addr, long data);
+void __sanitizer_syscall_pre_impl_add_key(long _type, long _description,
+                                          long _payload, long plen,
+                                          long destringid);
+void __sanitizer_syscall_post_impl_add_key(long res, long _type,
+                                           long _description, long _payload,
+                                           long plen, long destringid);
+void __sanitizer_syscall_pre_impl_request_key(long _type, long _description,
+                                              long _callout_info,
+                                              long destringid);
+void __sanitizer_syscall_post_impl_request_key(long res, long _type,
+                                               long _description,
+                                               long _callout_info,
+                                               long destringid);
+void __sanitizer_syscall_pre_impl_keyctl(long cmd, long arg2, long arg3,
+                                         long arg4, long arg5);
+void __sanitizer_syscall_post_impl_keyctl(long res, long cmd, long arg2,
+                                          long arg3, long arg4, long arg5);
+void __sanitizer_syscall_pre_impl_ioprio_set(long which, long who, long ioprio);
+void __sanitizer_syscall_post_impl_ioprio_set(long res, long which, long who,
+                                              long ioprio);
+void __sanitizer_syscall_pre_impl_ioprio_get(long which, long who);
+void __sanitizer_syscall_post_impl_ioprio_get(long res, long which, long who);
+void __sanitizer_syscall_pre_impl_set_mempolicy(long mode, long nmask,
+                                                long maxnode);
+void __sanitizer_syscall_post_impl_set_mempolicy(long res, long mode,
+                                                 long nmask, long maxnode);
+void __sanitizer_syscall_pre_impl_migrate_pages(long pid, long maxnode,
+                                                long from, long to);
+void __sanitizer_syscall_post_impl_migrate_pages(long res, long pid,
+                                                 long maxnode, long from,
+                                                 long to);
+void __sanitizer_syscall_pre_impl_move_pages(long pid, long nr_pages,
+                                             long pages, long nodes,
+                                             long status, long flags);
+void __sanitizer_syscall_post_impl_move_pages(long res, long pid, long nr_pages,
+                                              long pages, long nodes,
+                                              long status, long flags);
+void __sanitizer_syscall_pre_impl_mbind(long start, long len, long mode,
+                                        long nmask, long maxnode, long flags);
+void __sanitizer_syscall_post_impl_mbind(long res, long start, long len,
+                                         long mode, long nmask, long maxnode,
+                                         long flags);
+void __sanitizer_syscall_pre_impl_get_mempolicy(long policy, long nmask,
+                                                long maxnode, long addr,
+                                                long flags);
+void __sanitizer_syscall_post_impl_get_mempolicy(long res, long policy,
+                                                 long nmask, long maxnode,
+                                                 long addr, long flags);
+void __sanitizer_syscall_pre_impl_inotify_init();
+void __sanitizer_syscall_post_impl_inotify_init(long res);
+void __sanitizer_syscall_pre_impl_inotify_init1(long flags);
+void __sanitizer_syscall_post_impl_inotify_init1(long res, long flags);
+void __sanitizer_syscall_pre_impl_inotify_add_watch(long fd, long path,
+                                                    long mask);
+void __sanitizer_syscall_post_impl_inotify_add_watch(long res, long fd,
+                                                     long path, long mask);
+void __sanitizer_syscall_pre_impl_inotify_rm_watch(long fd, long wd);
+void __sanitizer_syscall_post_impl_inotify_rm_watch(long res, long fd, long wd);
+void __sanitizer_syscall_pre_impl_spu_run(long fd, long unpc, long ustatus);
+void __sanitizer_syscall_post_impl_spu_run(long res, long fd, long unpc,
+                                           long ustatus);
+void __sanitizer_syscall_pre_impl_spu_create(long name, long flags, long mode,
+                                             long fd);
+void __sanitizer_syscall_post_impl_spu_create(long res, long name, long flags,
+                                              long mode, long fd);
+void __sanitizer_syscall_pre_impl_mknodat(long dfd, long filename, long mode,
+                                          long dev);
+void __sanitizer_syscall_post_impl_mknodat(long res, long dfd, long filename,
+                                           long mode, long dev);
+void __sanitizer_syscall_pre_impl_mkdirat(long dfd, long pathname, long mode);
+void __sanitizer_syscall_post_impl_mkdirat(long res, long dfd, long pathname,
+                                           long mode);
+void __sanitizer_syscall_pre_impl_unlinkat(long dfd, long pathname, long flag);
+void __sanitizer_syscall_post_impl_unlinkat(long res, long dfd, long pathname,
+                                            long flag);
+void __sanitizer_syscall_pre_impl_symlinkat(long oldname, long newdfd,
+                                            long newname);
+void __sanitizer_syscall_post_impl_symlinkat(long res, long oldname,
+                                             long newdfd, long newname);
+void __sanitizer_syscall_pre_impl_linkat(long olddfd, long oldname, long newdfd,
+                                         long newname, long flags);
+void __sanitizer_syscall_post_impl_linkat(long res, long olddfd, long oldname,
+                                          long newdfd, long newname,
+                                          long flags);
+void __sanitizer_syscall_pre_impl_renameat(long olddfd, long oldname,
+                                           long newdfd, long newname);
+void __sanitizer_syscall_post_impl_renameat(long res, long olddfd, long oldname,
+                                            long newdfd, long newname);
+void __sanitizer_syscall_pre_impl_futimesat(long dfd, long filename,
+                                            long utimes);
+void __sanitizer_syscall_post_impl_futimesat(long res, long dfd, long filename,
+                                             long utimes);
+void __sanitizer_syscall_pre_impl_faccessat(long dfd, long filename, long mode);
+void __sanitizer_syscall_post_impl_faccessat(long res, long dfd, long filename,
+                                             long mode);
+void __sanitizer_syscall_pre_impl_fchmodat(long dfd, long filename, long mode);
+void __sanitizer_syscall_post_impl_fchmodat(long res, long dfd, long filename,
+                                            long mode);
+void __sanitizer_syscall_pre_impl_fchownat(long dfd, long filename, long user,
+                                           long group, long flag);
+void __sanitizer_syscall_post_impl_fchownat(long res, long dfd, long filename,
+                                            long user, long group, long flag);
+void __sanitizer_syscall_pre_impl_openat(long dfd, long filename, long flags,
+                                         long mode);
+void __sanitizer_syscall_post_impl_openat(long res, long dfd, long filename,
+                                          long flags, long mode);
+void __sanitizer_syscall_pre_impl_newfstatat(long dfd, long filename,
+                                             long statbuf, long flag);
+void __sanitizer_syscall_post_impl_newfstatat(long res, long dfd, long filename,
+                                              long statbuf, long flag);
+void __sanitizer_syscall_pre_impl_fstatat64(long dfd, long filename,
+                                            long statbuf, long flag);
+void __sanitizer_syscall_post_impl_fstatat64(long res, long dfd, long filename,
+                                             long statbuf, long flag);
+void __sanitizer_syscall_pre_impl_readlinkat(long dfd, long path, long buf,
+                                             long bufsiz);
+void __sanitizer_syscall_post_impl_readlinkat(long res, long dfd, long path,
+                                              long buf, long bufsiz);
+void __sanitizer_syscall_pre_impl_utimensat(long dfd, long filename,
+                                            long utimes, long flags);
+void __sanitizer_syscall_post_impl_utimensat(long res, long dfd, long filename,
+                                             long utimes, long flags);
+void __sanitizer_syscall_pre_impl_unshare(long unshare_flags);
+void __sanitizer_syscall_post_impl_unshare(long res, long unshare_flags);
+void __sanitizer_syscall_pre_impl_splice(long fd_in, long off_in, long fd_out,
+                                         long off_out, long len, long flags);
+void __sanitizer_syscall_post_impl_splice(long res, long fd_in, long off_in,
+                                          long fd_out, long off_out, long len,
+                                          long flags);
+void __sanitizer_syscall_pre_impl_vmsplice(long fd, long iov, long nr_segs,
+                                           long flags);
+void __sanitizer_syscall_post_impl_vmsplice(long res, long fd, long iov,
+                                            long nr_segs, long flags);
+void __sanitizer_syscall_pre_impl_tee(long fdin, long fdout, long len,
+                                      long flags);
+void __sanitizer_syscall_post_impl_tee(long res, long fdin, long fdout,
+                                       long len, long flags);
+void __sanitizer_syscall_pre_impl_get_robust_list(long pid, long head_ptr,
+                                                  long len_ptr);
+void __sanitizer_syscall_post_impl_get_robust_list(long res, long pid,
+                                                   long head_ptr, long len_ptr);
+void __sanitizer_syscall_pre_impl_set_robust_list(long head, long len);
+void __sanitizer_syscall_post_impl_set_robust_list(long res, long head,
+                                                   long len);
+void __sanitizer_syscall_pre_impl_getcpu(long cpu, long node, long cache);
+void __sanitizer_syscall_post_impl_getcpu(long res, long cpu, long node,
+                                          long cache);
+void __sanitizer_syscall_pre_impl_signalfd(long ufd, long user_mask,
+                                           long sizemask);
+void __sanitizer_syscall_post_impl_signalfd(long res, long ufd, long user_mask,
+                                            long sizemask);
+void __sanitizer_syscall_pre_impl_signalfd4(long ufd, long user_mask,
+                                            long sizemask, long flags);
+void __sanitizer_syscall_post_impl_signalfd4(long res, long ufd, long user_mask,
+                                             long sizemask, long flags);
+void __sanitizer_syscall_pre_impl_timerfd_create(long clockid, long flags);
+void __sanitizer_syscall_post_impl_timerfd_create(long res, long clockid,
+                                                  long flags);
+void __sanitizer_syscall_pre_impl_timerfd_settime(long ufd, long flags,
+                                                  long utmr, long otmr);
+void __sanitizer_syscall_post_impl_timerfd_settime(long res, long ufd,
+                                                   long flags, long utmr,
+                                                   long otmr);
+void __sanitizer_syscall_pre_impl_timerfd_gettime(long ufd, long otmr);
+void __sanitizer_syscall_post_impl_timerfd_gettime(long res, long ufd,
+                                                   long otmr);
+void __sanitizer_syscall_pre_impl_eventfd(long count);
+void __sanitizer_syscall_post_impl_eventfd(long res, long count);
+void __sanitizer_syscall_pre_impl_eventfd2(long count, long flags);
+void __sanitizer_syscall_post_impl_eventfd2(long res, long count, long flags);
+void __sanitizer_syscall_pre_impl_old_readdir(long arg0, long arg1, long arg2);
+void __sanitizer_syscall_post_impl_old_readdir(long res, long arg0, long arg1,
+                                               long arg2);
+void __sanitizer_syscall_pre_impl_pselect6(long arg0, long arg1, long arg2,
+                                           long arg3, long arg4, long arg5);
+void __sanitizer_syscall_post_impl_pselect6(long res, long arg0, long arg1,
+                                            long arg2, long arg3, long arg4,
+                                            long arg5);
+void __sanitizer_syscall_pre_impl_ppoll(long arg0, long arg1, long arg2,
+                                        long arg3, long arg4);
+void __sanitizer_syscall_post_impl_ppoll(long res, long arg0, long arg1,
+                                         long arg2, long arg3, long arg4);
+void __sanitizer_syscall_pre_impl_fanotify_init(long flags, long event_f_flags);
+void __sanitizer_syscall_post_impl_fanotify_init(long res, long flags,
+                                                 long event_f_flags);
+void __sanitizer_syscall_pre_impl_fanotify_mark(long fanotify_fd, long flags,
+                                                long mask, long fd,
+                                                long pathname);
+void __sanitizer_syscall_post_impl_fanotify_mark(long res, long fanotify_fd,
+                                                 long flags, long mask, long fd,
+                                                 long pathname);
+void __sanitizer_syscall_pre_impl_syncfs(long fd);
+void __sanitizer_syscall_post_impl_syncfs(long res, long fd);
+void __sanitizer_syscall_pre_impl_perf_event_open(long attr_uptr, long pid,
+                                                  long cpu, long group_fd,
+                                                  long flags);
+void __sanitizer_syscall_post_impl_perf_event_open(long res, long attr_uptr,
+                                                   long pid, long cpu,
+                                                   long group_fd, long flags);
+void __sanitizer_syscall_pre_impl_mmap_pgoff(long addr, long len, long prot,
+                                             long flags, long fd, long pgoff);
+void __sanitizer_syscall_post_impl_mmap_pgoff(long res, long addr, long len,
+                                              long prot, long flags, long fd,
+                                              long pgoff);
+void __sanitizer_syscall_pre_impl_old_mmap(long arg);
+void __sanitizer_syscall_post_impl_old_mmap(long res, long arg);
+void __sanitizer_syscall_pre_impl_name_to_handle_at(long dfd, long name,
+                                                    long handle, long mnt_id,
+                                                    long flag);
+void __sanitizer_syscall_post_impl_name_to_handle_at(long res, long dfd,
+                                                     long name, long handle,
+                                                     long mnt_id, long flag);
+void __sanitizer_syscall_pre_impl_open_by_handle_at(long mountdirfd,
+                                                    long handle, long flags);
+void __sanitizer_syscall_post_impl_open_by_handle_at(long res, long mountdirfd,
+                                                     long handle, long flags);
+void __sanitizer_syscall_pre_impl_setns(long fd, long nstype);
+void __sanitizer_syscall_post_impl_setns(long res, long fd, long nstype);
+void __sanitizer_syscall_pre_impl_process_vm_readv(long pid, long lvec,
+                                                   long liovcnt, long rvec,
+                                                   long riovcnt, long flags);
+void __sanitizer_syscall_post_impl_process_vm_readv(long res, long pid,
+                                                    long lvec, long liovcnt,
+                                                    long rvec, long riovcnt,
+                                                    long flags);
+void __sanitizer_syscall_pre_impl_process_vm_writev(long pid, long lvec,
+                                                    long liovcnt, long rvec,
+                                                    long riovcnt, long flags);
+void __sanitizer_syscall_post_impl_process_vm_writev(long res, long pid,
+                                                     long lvec, long liovcnt,
+                                                     long rvec, long riovcnt,
+                                                     long flags);
+void __sanitizer_syscall_pre_impl_fork();
+void __sanitizer_syscall_post_impl_fork(long res);
+void __sanitizer_syscall_pre_impl_vfork();
+void __sanitizer_syscall_post_impl_vfork(long res);
+void __sanitizer_syscall_pre_impl_sigaction(long signum, long act, long oldact);
+void __sanitizer_syscall_post_impl_sigaction(long res, long signum, long act,
+                                             long oldact);
+void __sanitizer_syscall_pre_impl_rt_sigaction(long signum, long act,
+                                               long oldact, long sz);
+void __sanitizer_syscall_post_impl_rt_sigaction(long res, long signum, long act,
+                                                long oldact, long sz);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_LINUX_SYSCALL_HOOKS_H
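
The hooks declared above all follow one calling convention: invoke the pre_ hook
with the raw syscall arguments, make the syscall, then invoke the post_ hook with
the result prepended to the same arguments. A minimal sketch for gethostname
(assuming the __sanitizer_syscall_pre_gethostname/__sanitizer_syscall_post_gethostname
wrapper macros that the full header defines in front of these impl declarations):

    #include <sanitizer/linux_syscall_hooks.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    long checked_gethostname(char *name, size_t len) {
      // Let the active sanitizer check the argument memory first.
      __sanitizer_syscall_pre_gethostname(name, len);
      long res = syscall(SYS_gethostname, name, len);
      // On success, the post hook marks the written buffer as initialized.
      __sanitizer_syscall_post_gethostname(res, name, len);
      return res;
    }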
diff --git a/darwin-x86/clang-headers/sanitizer/lsan_interface.h b/darwin-x86/clang-headers/sanitizer/lsan_interface.h
new file mode 100644
index 0000000..731b373
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/lsan_interface.h
@@ -0,0 +1,90 @@
+//===-- sanitizer/lsan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of LeakSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_LSAN_INTERFACE_H
+#define SANITIZER_LSAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Allocations made between calls to __lsan_disable() and __lsan_enable() will
+  // be treated as non-leaks. Disable/enable pairs may be nested.
+  void __lsan_disable(void);
+  void __lsan_enable(void);
+
+  // The heap object into which p points will be treated as a non-leak.
+  void __lsan_ignore_object(const void *p);
+
+  // Memory regions registered through this interface will be treated as sources
+  // of live pointers during leak checking. Useful if you store pointers in
+  // mapped memory.
+  // Points of note:
+  // - __lsan_unregister_root_region() must be called with the same pointer and
+  // size that were earlier passed to __lsan_register_root_region().
+  // - LSan will skip any inaccessible memory when scanning a root region. E.g.,
+  // if you map memory within a larger region that you have mprotect'ed, you can
+  // register the entire large region.
+  // - The implementation is not optimized for performance. This interface is
+  // intended to be used for a small number of relatively static regions.
+  void __lsan_register_root_region(const void *p, size_t size);
+  void __lsan_unregister_root_region(const void *p, size_t size);
+
+  // Check for leaks now. This function behaves identically to the default
+  // end-of-process leak check. In particular, it will terminate the process if
+  // leaks are found and the exitcode runtime flag is non-zero.
+  // Subsequent calls to this function will have no effect and end-of-process
+  // leak check will not run. Effectively, end-of-process leak check is moved to
+  // the time of first invocation of this function.
+  // By calling this function early during process shutdown, you can instruct
+  // LSan to ignore shutdown-only leaks which happen later on.
+  void __lsan_do_leak_check(void);
+
+  // Check for leaks now. Returns zero if no leaks have been found or if leak
+  // detection is disabled, non-zero otherwise.
+  // This function may be called repeatedly, e.g. to periodically check a
+  // long-running process. It prints a leak report if appropriate, but does not
+  // terminate the process. It does not affect the behavior of
+  // __lsan_do_leak_check() or the end-of-process leak check, and is not
+  // affected by them.
+  int __lsan_do_recoverable_leak_check(void);
+
+  // The user may optionally provide this function to disallow leak checking
+  // for the program it is linked into (if the return value is non-zero). This
+  // function must be defined as returning a constant value; any behavior beyond
+  // that is unsupported.
+  // To avoid dead stripping, you may need to define this function with
+  // __attribute__((used)).
+  int __lsan_is_turned_off(void);
+
+  // This function may be optionally provided by the user and should return
+  // a string containing LSan runtime options. See lsan_flags.inc for details.
+  const char *__lsan_default_options(void);
+
+  // This function may be optionally provided by the user and should return
+  // a string containing LSan suppressions.
+  const char *__lsan_default_suppressions(void);
+#ifdef __cplusplus
+}  // extern "C"
+
+namespace __lsan {
+class ScopedDisabler {
+ public:
+  ScopedDisabler() { __lsan_disable(); }
+  ~ScopedDisabler() { __lsan_enable(); }
+};
+}  // namespace __lsan
+#endif
+
+#endif  // SANITIZER_LSAN_INTERFACE_H
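
A usage sketch for the interface above (only the __lsan_* functions and
__lsan::ScopedDisabler come from this header; the surrounding code is
illustrative):

    #include <sanitizer/lsan_interface.h>
    #include <cstdlib>

    void example() {
      {
        // Anything allocated while the disabler is live is treated as a
        // non-leak, even if it is never freed.
        __lsan::ScopedDisabler disable_leak_checks;
        (void)std::malloc(512);
      }
      void *p = std::malloc(64);
      p = nullptr;  // The 64-byte block is now unreachable: a genuine leak.
      // Prints a report if leaks are found but does not terminate the
      // process; returns non-zero when anything was detected.
      if (__lsan_do_recoverable_leak_check()) {
        // Handle or log the detected leaks here.
      }
    }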
diff --git a/darwin-x86/clang-headers/sanitizer/msan_interface.h b/darwin-x86/clang-headers/sanitizer/msan_interface.h
new file mode 100644
index 0000000..0509551
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/msan_interface.h
@@ -0,0 +1,119 @@
+//===-- msan_interface.h --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of MemorySanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef MSAN_INTERFACE_H
+#define MSAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  /* Set raw origin for the memory range. */
+  void __msan_set_origin(const volatile void *a, size_t size, uint32_t origin);
+
+  /* Get raw origin for an address. */
+  uint32_t __msan_get_origin(const volatile void *a);
+
+  /* Test that this_id is a descendant of prev_id (or they are simply equal).
+   * "descendant" here means they are part of the same chain, created with
+   * __msan_chain_origin. */
+  int __msan_origin_is_descendant_or_same(uint32_t this_id, uint32_t prev_id);
+
+  /* Returns non-zero if tracking origins. */
+  int __msan_get_track_origins(void);
+
+  /* Returns the origin id of the latest UMR in the calling thread. */
+  uint32_t __msan_get_umr_origin(void);
+
+  /* Make memory region fully initialized (without changing its contents). */
+  void __msan_unpoison(const volatile void *a, size_t size);
+
+  /* Make a null-terminated string fully initialized (without changing its
+     contents). */
+  void __msan_unpoison_string(const volatile char *a);
+
+  /* Make memory region fully uninitialized (without changing its contents).
+     This is a legacy interface that does not update origin information. Use
+     __msan_allocated_memory() instead. */
+  void __msan_poison(const volatile void *a, size_t size);
+
+  /* Make memory region partially uninitialized (without changing its contents).
+   */
+  void __msan_partial_poison(const volatile void *data, void *shadow,
+                             size_t size);
+
+  /* Returns the offset of the first (at least partially) poisoned byte in the
+     memory range, or -1 if the whole range is good. */
+  intptr_t __msan_test_shadow(const volatile void *x, size_t size);
+
+  /* Checks that memory range is fully initialized, and reports an error if it
+   * is not. */
+  void __msan_check_mem_is_initialized(const volatile void *x, size_t size);
+
+  /* For testing:
+     __msan_set_expect_umr(1);
+     ... some buggy code ...
+     __msan_set_expect_umr(0);
+     The last line will verify that a UMR happened. */
+  void __msan_set_expect_umr(int expect_umr);
+
+  /* Change the value of the keep_going flag. A non-zero value means don't
+     terminate program execution when an error is detected. This will not
+     affect errors in modules that were compiled without the corresponding
+     compiler flag. */
+  void __msan_set_keep_going(int keep_going);
+
+  /* Print shadow and origin for the memory range to stderr in a human-readable
+     format. */
+  void __msan_print_shadow(const volatile void *x, size_t size);
+
+  /* Print shadow for the memory range to stderr in a minimalistic
+     human-readable format. */
+  void __msan_dump_shadow(const volatile void *x, size_t size);
+
+  /* Returns true if running under a dynamic tool (DynamoRIO-based). */
+  int  __msan_has_dynamic_component(void);
+
+  /* Tell MSan about newly allocated memory (e.g., from a custom allocator).
+     Memory will be marked uninitialized, with origin at the call site. */
+  void __msan_allocated_memory(const volatile void* data, size_t size);
+
+  /* Tell MSan about newly destroyed memory. Mark memory as uninitialized. */
+  void __sanitizer_dtor_callback(const volatile void* data, size_t size);
+
+  /* This function may be optionally provided by the user and should return
+     a string containing MSan runtime options. See msan_flags.h for details. */
+  const char* __msan_default_options(void);
+
+  /* Deprecated. Call __sanitizer_set_death_callback instead. */
+  void __msan_set_death_callback(void (*callback)(void));
+
+  /* Update shadow for the application copy of size bytes from src to dst.
+     Src and dst are application addresses. This function does not copy the
+     actual application memory, it only updates shadow and origin for such
+     copy. Source and destination regions can overlap. */
+  void __msan_copy_shadow(const volatile void *dst, const volatile void *src,
+                          size_t size);
+
+  /* Disables uninitialized memory checks in interceptors. */
+  void __msan_scoped_disable_interceptor_checks(void);
+
+  /* Re-enables uninitialized memory checks in interceptors after a previous
+     call to __msan_scoped_disable_interceptor_checks. */
+  void __msan_scoped_enable_interceptor_checks(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
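
A short sketch of the poison/unpoison workflow above (read_from_device is a
hypothetical routine that initializes memory in a way MSan cannot observe,
e.g. DMA or hand-written assembly; only the __msan_* calls come from this
header):

    #include <sanitizer/msan_interface.h>

    extern "C" void read_from_device(char *buf, size_t n);  // hypothetical

    intptr_t fill_and_check(char *buf, size_t n) {
      read_from_device(buf, n);
      // The bytes are initialized now, but MSan's shadow still says
      // otherwise; update the shadow without touching the contents.
      __msan_unpoison(buf, n);
      // -1 means the whole range is clean; otherwise this is the offset of
      // the first (at least partially) poisoned byte.
      return __msan_test_shadow(buf, n);
    }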
diff --git a/darwin-x86/clang-headers/sanitizer/netbsd_syscall_hooks.h b/darwin-x86/clang-headers/sanitizer/netbsd_syscall_hooks.h
new file mode 100644
index 0000000..4c6c6a8
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/netbsd_syscall_hooks.h
@@ -0,0 +1,4734 @@
+//===-- netbsd_syscall_hooks.h --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of public sanitizer interface.
+//
+// System call handlers.
+//
+// Interface methods declared in this header implement pre- and post-syscall
+// actions for the active sanitizer.
+// Usage:
+//   __sanitizer_syscall_pre_getfoo(...args...);
+//   long long res = syscall(SYS_getfoo, ...args...);
+//   __sanitizer_syscall_post_getfoo(res, ...args...);
+//
+// DO NOT EDIT! THIS FILE HAS BEEN GENERATED!
+//
+// Generated with: generate_netbsd_syscalls.awk
+// Generated date: 2018-03-03
+// Generated from: syscalls.master,v 1.291 2018/01/06 16:41:23 kamil Exp
+//
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_NETBSD_SYSCALL_HOOKS_H
+#define SANITIZER_NETBSD_SYSCALL_HOOKS_H
+
+#define __sanitizer_syscall_pre_syscall(code, arg0, arg1, arg2, arg3, arg4,    \
+                                        arg5, arg6, arg7)                      \
+  __sanitizer_syscall_pre_impl_syscall(                                        \
+      (long long)(code), (long long)(arg0), (long long)(arg1),                 \
+      (long long)(arg2), (long long)(arg3), (long long)(arg4),                 \
+      (long long)(arg5), (long long)(arg6), (long long)(arg7))
+#define __sanitizer_syscall_post_syscall(res, code, arg0, arg1, arg2, arg3,    \
+                                         arg4, arg5, arg6, arg7)               \
+  __sanitizer_syscall_post_impl_syscall(                                       \
+      res, (long long)(code), (long long)(arg0), (long long)(arg1),            \
+      (long long)(arg2), (long long)(arg3), (long long)(arg4),                 \
+      (long long)(arg5), (long long)(arg6), (long long)(arg7))
+#define __sanitizer_syscall_pre_exit(rval)                                     \
+  __sanitizer_syscall_pre_impl_exit((long long)(rval))
+#define __sanitizer_syscall_post_exit(res, rval)                               \
+  __sanitizer_syscall_post_impl_exit(res, (long long)(rval))
+#define __sanitizer_syscall_pre_fork() __sanitizer_syscall_pre_impl_fork()
+#define __sanitizer_syscall_post_fork(res)                                     \
+  __sanitizer_syscall_post_impl_fork(res)
+#define __sanitizer_syscall_pre_read(fd, buf, nbyte)                           \
+  __sanitizer_syscall_pre_impl_read((long long)(fd), (long long)(buf),         \
+                                    (long long)(nbyte))
+#define __sanitizer_syscall_post_read(res, fd, buf, nbyte)                     \
+  __sanitizer_syscall_post_impl_read(res, (long long)(fd), (long long)(buf),   \
+                                     (long long)(nbyte))
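+// For example, wrapping a raw read(2) with the two macros just defined (a
+// minimal sketch; error handling omitted):
+//
+//   char buf[128];
+//   __sanitizer_syscall_pre_read(fd, buf, sizeof(buf));
+//   long long res = syscall(SYS_read, fd, buf, sizeof(buf));
+//   __sanitizer_syscall_post_read(res, fd, buf, sizeof(buf));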
+#define __sanitizer_syscall_pre_write(fd, buf, nbyte)                          \
+  __sanitizer_syscall_pre_impl_write((long long)(fd), (long long)(buf),        \
+                                     (long long)(nbyte))
+#define __sanitizer_syscall_post_write(res, fd, buf, nbyte)                    \
+  __sanitizer_syscall_post_impl_write(res, (long long)(fd), (long long)(buf),  \
+                                      (long long)(nbyte))
+#define __sanitizer_syscall_pre_open(path, flags, mode)                        \
+  __sanitizer_syscall_pre_impl_open((long long)(path), (long long)(flags),     \
+                                    (long long)(mode))
+#define __sanitizer_syscall_post_open(res, path, flags, mode)                  \
+  __sanitizer_syscall_post_impl_open(res, (long long)(path),                   \
+                                     (long long)(flags), (long long)(mode))
+#define __sanitizer_syscall_pre_close(fd)                                      \
+  __sanitizer_syscall_pre_impl_close((long long)(fd))
+#define __sanitizer_syscall_post_close(res, fd)                                \
+  __sanitizer_syscall_post_impl_close(res, (long long)(fd))
+#define __sanitizer_syscall_pre_compat_50_wait4(pid, status, options, rusage)  \
+  __sanitizer_syscall_pre_impl_compat_50_wait4(                                \
+      (long long)(pid), (long long)(status), (long long)(options),             \
+      (long long)(rusage))
+#define __sanitizer_syscall_post_compat_50_wait4(res, pid, status, options,    \
+                                                 rusage)                       \
+  __sanitizer_syscall_post_impl_compat_50_wait4(                               \
+      res, (long long)(pid), (long long)(status), (long long)(options),        \
+      (long long)(rusage))
+#define __sanitizer_syscall_pre_compat_43_ocreat(path, mode)                   \
+  __sanitizer_syscall_pre_impl_compat_43_ocreat((long long)(path),             \
+                                                (long long)(mode))
+#define __sanitizer_syscall_post_compat_43_ocreat(res, path, mode)             \
+  __sanitizer_syscall_post_impl_compat_43_ocreat(res, (long long)(path),       \
+                                                 (long long)(mode))
+#define __sanitizer_syscall_pre_link(path, link)                               \
+  __sanitizer_syscall_pre_impl_link((long long)(path), (long long)(link))
+#define __sanitizer_syscall_post_link(res, path, link)                         \
+  __sanitizer_syscall_post_impl_link(res, (long long)(path), (long long)(link))
+#define __sanitizer_syscall_pre_unlink(path)                                   \
+  __sanitizer_syscall_pre_impl_unlink((long long)(path))
+#define __sanitizer_syscall_post_unlink(res, path)                             \
+  __sanitizer_syscall_post_impl_unlink(res, (long long)(path))
+/* syscall 11 has been skipped */
+#define __sanitizer_syscall_pre_chdir(path)                                    \
+  __sanitizer_syscall_pre_impl_chdir((long long)(path))
+#define __sanitizer_syscall_post_chdir(res, path)                              \
+  __sanitizer_syscall_post_impl_chdir(res, (long long)(path))
+#define __sanitizer_syscall_pre_fchdir(fd)                                     \
+  __sanitizer_syscall_pre_impl_fchdir((long long)(fd))
+#define __sanitizer_syscall_post_fchdir(res, fd)                               \
+  __sanitizer_syscall_post_impl_fchdir(res, (long long)(fd))
+#define __sanitizer_syscall_pre_compat_50_mknod(path, mode, dev)               \
+  __sanitizer_syscall_pre_impl_compat_50_mknod(                                \
+      (long long)(path), (long long)(mode), (long long)(dev))
+#define __sanitizer_syscall_post_compat_50_mknod(res, path, mode, dev)         \
+  __sanitizer_syscall_post_impl_compat_50_mknod(                               \
+      res, (long long)(path), (long long)(mode), (long long)(dev))
+#define __sanitizer_syscall_pre_chmod(path, mode)                              \
+  __sanitizer_syscall_pre_impl_chmod((long long)(path), (long long)(mode))
+#define __sanitizer_syscall_post_chmod(res, path, mode)                        \
+  __sanitizer_syscall_post_impl_chmod(res, (long long)(path), (long long)(mode))
+#define __sanitizer_syscall_pre_chown(path, uid, gid)                          \
+  __sanitizer_syscall_pre_impl_chown((long long)(path), (long long)(uid),      \
+                                     (long long)(gid))
+#define __sanitizer_syscall_post_chown(res, path, uid, gid)                    \
+  __sanitizer_syscall_post_impl_chown(res, (long long)(path),                  \
+                                      (long long)(uid), (long long)(gid))
+#define __sanitizer_syscall_pre_break(nsize)                                   \
+  __sanitizer_syscall_pre_impl_break((long long)(nsize))
+#define __sanitizer_syscall_post_break(res, nsize)                             \
+  __sanitizer_syscall_post_impl_break(res, (long long)(nsize))
+#define __sanitizer_syscall_pre_compat_20_getfsstat(buf, bufsize, flags)       \
+  __sanitizer_syscall_pre_impl_compat_20_getfsstat(                            \
+      (long long)(buf), (long long)(bufsize), (long long)(flags))
+#define __sanitizer_syscall_post_compat_20_getfsstat(res, buf, bufsize, flags) \
+  __sanitizer_syscall_post_impl_compat_20_getfsstat(                           \
+      res, (long long)(buf), (long long)(bufsize), (long long)(flags))
+#define __sanitizer_syscall_pre_compat_43_olseek(fd, offset, whence)           \
+  __sanitizer_syscall_pre_impl_compat_43_olseek(                               \
+      (long long)(fd), (long long)(offset), (long long)(whence))
+#define __sanitizer_syscall_post_compat_43_olseek(res, fd, offset, whence)     \
+  __sanitizer_syscall_post_impl_compat_43_olseek(                              \
+      res, (long long)(fd), (long long)(offset), (long long)(whence))
+#define __sanitizer_syscall_pre_getpid() __sanitizer_syscall_pre_impl_getpid()
+#define __sanitizer_syscall_post_getpid(res)                                   \
+  __sanitizer_syscall_post_impl_getpid(res)
+#define __sanitizer_syscall_pre_compat_40_mount(type, path, flags, data)       \
+  __sanitizer_syscall_pre_impl_compat_40_mount(                                \
+      (long long)(type), (long long)(path), (long long)(flags),                \
+      (long long)(data))
+#define __sanitizer_syscall_post_compat_40_mount(res, type, path, flags, data) \
+  __sanitizer_syscall_post_impl_compat_40_mount(                               \
+      res, (long long)(type), (long long)(path), (long long)(flags),           \
+      (long long)(data))
+#define __sanitizer_syscall_pre_unmount(path, flags)                           \
+  __sanitizer_syscall_pre_impl_unmount((long long)(path), (long long)(flags))
+#define __sanitizer_syscall_post_unmount(res, path, flags)                     \
+  __sanitizer_syscall_post_impl_unmount(res, (long long)(path),                \
+                                        (long long)(flags))
+#define __sanitizer_syscall_pre_setuid(uid)                                    \
+  __sanitizer_syscall_pre_impl_setuid((long long)(uid))
+#define __sanitizer_syscall_post_setuid(res, uid)                              \
+  __sanitizer_syscall_post_impl_setuid(res, (long long)(uid))
+#define __sanitizer_syscall_pre_getuid() __sanitizer_syscall_pre_impl_getuid()
+#define __sanitizer_syscall_post_getuid(res)                                   \
+  __sanitizer_syscall_post_impl_getuid(res)
+#define __sanitizer_syscall_pre_geteuid() __sanitizer_syscall_pre_impl_geteuid()
+#define __sanitizer_syscall_post_geteuid(res)                                  \
+  __sanitizer_syscall_post_impl_geteuid(res)
+#define __sanitizer_syscall_pre_ptrace(req, pid, addr, data)                   \
+  __sanitizer_syscall_pre_impl_ptrace((long long)(req), (long long)(pid),      \
+                                      (long long)(addr), (long long)(data))
+#define __sanitizer_syscall_post_ptrace(res, req, pid, addr, data)             \
+  __sanitizer_syscall_post_impl_ptrace(res, (long long)(req),                  \
+                                       (long long)(pid), (long long)(addr),    \
+                                       (long long)(data))
+#define __sanitizer_syscall_pre_recvmsg(s, msg, flags)                         \
+  __sanitizer_syscall_pre_impl_recvmsg((long long)(s), (long long)(msg),       \
+                                       (long long)(flags))
+#define __sanitizer_syscall_post_recvmsg(res, s, msg, flags)                   \
+  __sanitizer_syscall_post_impl_recvmsg(res, (long long)(s), (long long)(msg), \
+                                        (long long)(flags))
+#define __sanitizer_syscall_pre_sendmsg(s, msg, flags)                         \
+  __sanitizer_syscall_pre_impl_sendmsg((long long)(s), (long long)(msg),       \
+                                       (long long)(flags))
+#define __sanitizer_syscall_post_sendmsg(res, s, msg, flags)                   \
+  __sanitizer_syscall_post_impl_sendmsg(res, (long long)(s), (long long)(msg), \
+                                        (long long)(flags))
+#define __sanitizer_syscall_pre_recvfrom(s, buf, len, flags, from,             \
+                                         fromlenaddr)                          \
+  __sanitizer_syscall_pre_impl_recvfrom(                                       \
+      (long long)(s), (long long)(buf), (long long)(len), (long long)(flags),  \
+      (long long)(from), (long long)(fromlenaddr))
+#define __sanitizer_syscall_post_recvfrom(res, s, buf, len, flags, from,       \
+                                          fromlenaddr)                         \
+  __sanitizer_syscall_post_impl_recvfrom(                                      \
+      res, (long long)(s), (long long)(buf), (long long)(len),                 \
+      (long long)(flags), (long long)(from), (long long)(fromlenaddr))
+#define __sanitizer_syscall_pre_accept(s, name, anamelen)                      \
+  __sanitizer_syscall_pre_impl_accept((long long)(s), (long long)(name),       \
+                                      (long long)(anamelen))
+#define __sanitizer_syscall_post_accept(res, s, name, anamelen)                \
+  __sanitizer_syscall_post_impl_accept(res, (long long)(s), (long long)(name), \
+                                       (long long)(anamelen))
+#define __sanitizer_syscall_pre_getpeername(fdes, asa, alen)                   \
+  __sanitizer_syscall_pre_impl_getpeername(                                    \
+      (long long)(fdes), (long long)(asa), (long long)(alen))
+#define __sanitizer_syscall_post_getpeername(res, fdes, asa, alen)             \
+  __sanitizer_syscall_post_impl_getpeername(                                   \
+      res, (long long)(fdes), (long long)(asa), (long long)(alen))
+#define __sanitizer_syscall_pre_getsockname(fdes, asa, alen)                   \
+  __sanitizer_syscall_pre_impl_getsockname(                                    \
+      (long long)(fdes), (long long)(asa), (long long)(alen))
+#define __sanitizer_syscall_post_getsockname(res, fdes, asa, alen)             \
+  __sanitizer_syscall_post_impl_getsockname(                                   \
+      res, (long long)(fdes), (long long)(asa), (long long)(alen))
+#define __sanitizer_syscall_pre_access(path, flags)                            \
+  __sanitizer_syscall_pre_impl_access((long long)(path), (long long)(flags))
+#define __sanitizer_syscall_post_access(res, path, flags)                      \
+  __sanitizer_syscall_post_impl_access(res, (long long)(path),                 \
+                                       (long long)(flags))
+#define __sanitizer_syscall_pre_chflags(path, flags)                           \
+  __sanitizer_syscall_pre_impl_chflags((long long)(path), (long long)(flags))
+#define __sanitizer_syscall_post_chflags(res, path, flags)                     \
+  __sanitizer_syscall_post_impl_chflags(res, (long long)(path),                \
+                                        (long long)(flags))
+#define __sanitizer_syscall_pre_fchflags(fd, flags)                            \
+  __sanitizer_syscall_pre_impl_fchflags((long long)(fd), (long long)(flags))
+#define __sanitizer_syscall_post_fchflags(res, fd, flags)                      \
+  __sanitizer_syscall_post_impl_fchflags(res, (long long)(fd),                 \
+                                         (long long)(flags))
+#define __sanitizer_syscall_pre_sync() __sanitizer_syscall_pre_impl_sync()
+#define __sanitizer_syscall_post_sync(res)                                     \
+  __sanitizer_syscall_post_impl_sync(res)
+#define __sanitizer_syscall_pre_kill(pid, signum)                              \
+  __sanitizer_syscall_pre_impl_kill((long long)(pid), (long long)(signum))
+#define __sanitizer_syscall_post_kill(res, pid, signum)                        \
+  __sanitizer_syscall_post_impl_kill(res, (long long)(pid), (long long)(signum))
+#define __sanitizer_syscall_pre_compat_43_stat43(path, ub)                     \
+  __sanitizer_syscall_pre_impl_compat_43_stat43((long long)(path),             \
+                                                (long long)(ub))
+#define __sanitizer_syscall_post_compat_43_stat43(res, path, ub)               \
+  __sanitizer_syscall_post_impl_compat_43_stat43(res, (long long)(path),       \
+                                                 (long long)(ub))
+#define __sanitizer_syscall_pre_getppid() __sanitizer_syscall_pre_impl_getppid()
+#define __sanitizer_syscall_post_getppid(res)                                  \
+  __sanitizer_syscall_post_impl_getppid(res)
+#define __sanitizer_syscall_pre_compat_43_lstat43(path, ub)                    \
+  __sanitizer_syscall_pre_impl_compat_43_lstat43((long long)(path),            \
+                                                 (long long)(ub))
+#define __sanitizer_syscall_post_compat_43_lstat43(res, path, ub)              \
+  __sanitizer_syscall_post_impl_compat_43_lstat43(res, (long long)(path),      \
+                                                  (long long)(ub))
+#define __sanitizer_syscall_pre_dup(fd)                                        \
+  __sanitizer_syscall_pre_impl_dup((long long)(fd))
+#define __sanitizer_syscall_post_dup(res, fd)                                  \
+  __sanitizer_syscall_post_impl_dup(res, (long long)(fd))
+#define __sanitizer_syscall_pre_pipe() __sanitizer_syscall_pre_impl_pipe()
+#define __sanitizer_syscall_post_pipe(res)                                     \
+  __sanitizer_syscall_post_impl_pipe(res)
+#define __sanitizer_syscall_pre_getegid() __sanitizer_syscall_pre_impl_getegid()
+#define __sanitizer_syscall_post_getegid(res)                                  \
+  __sanitizer_syscall_post_impl_getegid(res)
+#define __sanitizer_syscall_pre_profil(samples, size, offset, scale)           \
+  __sanitizer_syscall_pre_impl_profil((long long)(samples), (long long)(size), \
+                                      (long long)(offset), (long long)(scale))
+#define __sanitizer_syscall_post_profil(res, samples, size, offset, scale)     \
+  __sanitizer_syscall_post_impl_profil(res, (long long)(samples),              \
+                                       (long long)(size), (long long)(offset), \
+                                       (long long)(scale))
+#define __sanitizer_syscall_pre_ktrace(fname, ops, facs, pid)                  \
+  __sanitizer_syscall_pre_impl_ktrace((long long)(fname), (long long)(ops),    \
+                                      (long long)(facs), (long long)(pid))
+#define __sanitizer_syscall_post_ktrace(res, fname, ops, facs, pid)            \
+  __sanitizer_syscall_post_impl_ktrace(res, (long long)(fname),                \
+                                       (long long)(ops), (long long)(facs),    \
+                                       (long long)(pid))
+#define __sanitizer_syscall_pre_compat_13_sigaction13(signum, nsa, osa)        \
+  __sanitizer_syscall_pre_impl_compat_13_sigaction13(                          \
+      (long long)(signum), (long long)(nsa), (long long)(osa))
+#define __sanitizer_syscall_post_compat_13_sigaction13(res, signum, nsa, osa)  \
+  __sanitizer_syscall_post_impl_compat_13_sigaction13(                         \
+      res, (long long)(signum), (long long)(nsa), (long long)(osa))
+#define __sanitizer_syscall_pre_getgid() __sanitizer_syscall_pre_impl_getgid()
+#define __sanitizer_syscall_post_getgid(res)                                   \
+  __sanitizer_syscall_post_impl_getgid(res)
+#define __sanitizer_syscall_pre_compat_13_sigprocmask13(how, mask)             \
+  __sanitizer_syscall_pre_impl_compat_13_sigprocmask13((long long)(how),       \
+                                                       (long long)(mask))
+#define __sanitizer_syscall_post_compat_13_sigprocmask13(res, how, mask)       \
+  __sanitizer_syscall_post_impl_compat_13_sigprocmask13(res, (long long)(how), \
+                                                        (long long)(mask))
+#define __sanitizer_syscall_pre___getlogin(namebuf, namelen)                   \
+  __sanitizer_syscall_pre_impl___getlogin((long long)(namebuf),                \
+                                          (long long)(namelen))
+#define __sanitizer_syscall_post___getlogin(res, namebuf, namelen)             \
+  __sanitizer_syscall_post_impl___getlogin(res, (long long)(namebuf),          \
+                                           (long long)(namelen))
+#define __sanitizer_syscall_pre___setlogin(namebuf)                            \
+  __sanitizer_syscall_pre_impl___setlogin((long long)(namebuf))
+#define __sanitizer_syscall_post___setlogin(res, namebuf)                      \
+  __sanitizer_syscall_post_impl___setlogin(res, (long long)(namebuf))
+#define __sanitizer_syscall_pre_acct(path)                                     \
+  __sanitizer_syscall_pre_impl_acct((long long)(path))
+#define __sanitizer_syscall_post_acct(res, path)                               \
+  __sanitizer_syscall_post_impl_acct(res, (long long)(path))
+#define __sanitizer_syscall_pre_compat_13_sigpending13()                       \
+  __sanitizer_syscall_pre_impl_compat_13_sigpending13()
+#define __sanitizer_syscall_post_compat_13_sigpending13(res)                   \
+  __sanitizer_syscall_post_impl_compat_13_sigpending13(res)
+#define __sanitizer_syscall_pre_compat_13_sigaltstack13(nss, oss)              \
+  __sanitizer_syscall_pre_impl_compat_13_sigaltstack13((long long)(nss),       \
+                                                       (long long)(oss))
+#define __sanitizer_syscall_post_compat_13_sigaltstack13(res, nss, oss)        \
+  __sanitizer_syscall_post_impl_compat_13_sigaltstack13(res, (long long)(nss), \
+                                                        (long long)(oss))
+#define __sanitizer_syscall_pre_ioctl(fd, com, data)                           \
+  __sanitizer_syscall_pre_impl_ioctl((long long)(fd), (long long)(com),        \
+                                     (long long)(data))
+#define __sanitizer_syscall_post_ioctl(res, fd, com, data)                     \
+  __sanitizer_syscall_post_impl_ioctl(res, (long long)(fd), (long long)(com),  \
+                                      (long long)(data))
+#define __sanitizer_syscall_pre_compat_12_oreboot(opt)                         \
+  __sanitizer_syscall_pre_impl_compat_12_oreboot((long long)(opt))
+#define __sanitizer_syscall_post_compat_12_oreboot(res, opt)                   \
+  __sanitizer_syscall_post_impl_compat_12_oreboot(res, (long long)(opt))
+#define __sanitizer_syscall_pre_revoke(path)                                   \
+  __sanitizer_syscall_pre_impl_revoke((long long)(path))
+#define __sanitizer_syscall_post_revoke(res, path)                             \
+  __sanitizer_syscall_post_impl_revoke(res, (long long)(path))
+#define __sanitizer_syscall_pre_symlink(path, link)                            \
+  __sanitizer_syscall_pre_impl_symlink((long long)(path), (long long)(link))
+#define __sanitizer_syscall_post_symlink(res, path, link)                      \
+  __sanitizer_syscall_post_impl_symlink(res, (long long)(path),                \
+                                        (long long)(link))
+#define __sanitizer_syscall_pre_readlink(path, buf, count)                     \
+  __sanitizer_syscall_pre_impl_readlink((long long)(path), (long long)(buf),   \
+                                        (long long)(count))
+#define __sanitizer_syscall_post_readlink(res, path, buf, count)               \
+  __sanitizer_syscall_post_impl_readlink(res, (long long)(path),               \
+                                         (long long)(buf), (long long)(count))
+#define __sanitizer_syscall_pre_execve(path, argp, envp)                       \
+  __sanitizer_syscall_pre_impl_execve((long long)(path), (long long)(argp),    \
+                                      (long long)(envp))
+#define __sanitizer_syscall_post_execve(res, path, argp, envp)                 \
+  __sanitizer_syscall_post_impl_execve(res, (long long)(path),                 \
+                                       (long long)(argp), (long long)(envp))
+#define __sanitizer_syscall_pre_umask(newmask)                                 \
+  __sanitizer_syscall_pre_impl_umask((long long)(newmask))
+#define __sanitizer_syscall_post_umask(res, newmask)                           \
+  __sanitizer_syscall_post_impl_umask(res, (long long)(newmask))
+#define __sanitizer_syscall_pre_chroot(path)                                   \
+  __sanitizer_syscall_pre_impl_chroot((long long)(path))
+#define __sanitizer_syscall_post_chroot(res, path)                             \
+  __sanitizer_syscall_post_impl_chroot(res, (long long)(path))
+#define __sanitizer_syscall_pre_compat_43_fstat43(fd, sb)                      \
+  __sanitizer_syscall_pre_impl_compat_43_fstat43((long long)(fd),              \
+                                                 (long long)(sb))
+#define __sanitizer_syscall_post_compat_43_fstat43(res, fd, sb)                \
+  __sanitizer_syscall_post_impl_compat_43_fstat43(res, (long long)(fd),        \
+                                                  (long long)(sb))
+#define __sanitizer_syscall_pre_compat_43_ogetkerninfo(op, where, size, arg)   \
+  __sanitizer_syscall_pre_impl_compat_43_ogetkerninfo(                         \
+      (long long)(op), (long long)(where), (long long)(size),                  \
+      (long long)(arg))
+#define __sanitizer_syscall_post_compat_43_ogetkerninfo(res, op, where, size,  \
+                                                        arg)                   \
+  __sanitizer_syscall_post_impl_compat_43_ogetkerninfo(                        \
+      res, (long long)(op), (long long)(where), (long long)(size),             \
+      (long long)(arg))
+#define __sanitizer_syscall_pre_compat_43_ogetpagesize()                       \
+  __sanitizer_syscall_pre_impl_compat_43_ogetpagesize()
+#define __sanitizer_syscall_post_compat_43_ogetpagesize(res)                   \
+  __sanitizer_syscall_post_impl_compat_43_ogetpagesize(res)
+#define __sanitizer_syscall_pre_compat_12_msync(addr, len)                     \
+  __sanitizer_syscall_pre_impl_compat_12_msync((long long)(addr),              \
+                                               (long long)(len))
+#define __sanitizer_syscall_post_compat_12_msync(res, addr, len)               \
+  __sanitizer_syscall_post_impl_compat_12_msync(res, (long long)(addr),        \
+                                                (long long)(len))
+#define __sanitizer_syscall_pre_vfork() __sanitizer_syscall_pre_impl_vfork()
+#define __sanitizer_syscall_post_vfork(res)                                    \
+  __sanitizer_syscall_post_impl_vfork(res)
+/* syscall 67 has been skipped */
+/* syscall 68 has been skipped */
+/* syscall 69 has been skipped */
+/* syscall 70 has been skipped */
+#define __sanitizer_syscall_pre_compat_43_ommap(addr, len, prot, flags, fd,    \
+                                                pos)                           \
+  __sanitizer_syscall_pre_impl_compat_43_ommap(                                \
+      (long long)(addr), (long long)(len), (long long)(prot),                  \
+      (long long)(flags), (long long)(fd), (long long)(pos))
+#define __sanitizer_syscall_post_compat_43_ommap(res, addr, len, prot, flags,  \
+                                                 fd, pos)                      \
+  __sanitizer_syscall_post_impl_compat_43_ommap(                               \
+      res, (long long)(addr), (long long)(len), (long long)(prot),             \
+      (long long)(flags), (long long)(fd), (long long)(pos))
+#define __sanitizer_syscall_pre_vadvise(anom)                                  \
+  __sanitizer_syscall_pre_impl_vadvise((long long)(anom))
+#define __sanitizer_syscall_post_vadvise(res, anom)                            \
+  __sanitizer_syscall_post_impl_vadvise(res, (long long)(anom))
+#define __sanitizer_syscall_pre_munmap(addr, len)                              \
+  __sanitizer_syscall_pre_impl_munmap((long long)(addr), (long long)(len))
+#define __sanitizer_syscall_post_munmap(res, addr, len)                        \
+  __sanitizer_syscall_post_impl_munmap(res, (long long)(addr), (long long)(len))
+#define __sanitizer_syscall_pre_mprotect(addr, len, prot)                      \
+  __sanitizer_syscall_pre_impl_mprotect((long long)(addr), (long long)(len),   \
+                                        (long long)(prot))
+#define __sanitizer_syscall_post_mprotect(res, addr, len, prot)                \
+  __sanitizer_syscall_post_impl_mprotect(res, (long long)(addr),               \
+                                         (long long)(len), (long long)(prot))
+#define __sanitizer_syscall_pre_madvise(addr, len, behav)                      \
+  __sanitizer_syscall_pre_impl_madvise((long long)(addr), (long long)(len),    \
+                                       (long long)(behav))
+#define __sanitizer_syscall_post_madvise(res, addr, len, behav)                \
+  __sanitizer_syscall_post_impl_madvise(res, (long long)(addr),                \
+                                        (long long)(len), (long long)(behav))
+/* syscall 76 has been skipped */
+/* syscall 77 has been skipped */
+#define __sanitizer_syscall_pre_mincore(addr, len, vec)                        \
+  __sanitizer_syscall_pre_impl_mincore((long long)(addr), (long long)(len),    \
+                                       (long long)(vec))
+#define __sanitizer_syscall_post_mincore(res, addr, len, vec)                  \
+  __sanitizer_syscall_post_impl_mincore(res, (long long)(addr),                \
+                                        (long long)(len), (long long)(vec))
+#define __sanitizer_syscall_pre_getgroups(gidsetsize, gidset)                  \
+  __sanitizer_syscall_pre_impl_getgroups((long long)(gidsetsize),              \
+                                         (long long)(gidset))
+#define __sanitizer_syscall_post_getgroups(res, gidsetsize, gidset)            \
+  __sanitizer_syscall_post_impl_getgroups(res, (long long)(gidsetsize),        \
+                                          (long long)(gidset))
+#define __sanitizer_syscall_pre_setgroups(gidsetsize, gidset)                  \
+  __sanitizer_syscall_pre_impl_setgroups((long long)(gidsetsize),              \
+                                         (long long)(gidset))
+#define __sanitizer_syscall_post_setgroups(res, gidsetsize, gidset)            \
+  __sanitizer_syscall_post_impl_setgroups(res, (long long)(gidsetsize),        \
+                                          (long long)(gidset))
+#define __sanitizer_syscall_pre_getpgrp() __sanitizer_syscall_pre_impl_getpgrp()
+#define __sanitizer_syscall_post_getpgrp(res)                                  \
+  __sanitizer_syscall_post_impl_getpgrp(res)
+#define __sanitizer_syscall_pre_setpgid(pid, pgid)                             \
+  __sanitizer_syscall_pre_impl_setpgid((long long)(pid), (long long)(pgid))
+#define __sanitizer_syscall_post_setpgid(res, pid, pgid)                       \
+  __sanitizer_syscall_post_impl_setpgid(res, (long long)(pid),                 \
+                                        (long long)(pgid))
+#define __sanitizer_syscall_pre_compat_50_setitimer(which, itv, oitv)          \
+  __sanitizer_syscall_pre_impl_compat_50_setitimer(                            \
+      (long long)(which), (long long)(itv), (long long)(oitv))
+#define __sanitizer_syscall_post_compat_50_setitimer(res, which, itv, oitv)    \
+  __sanitizer_syscall_post_impl_compat_50_setitimer(                           \
+      res, (long long)(which), (long long)(itv), (long long)(oitv))
+#define __sanitizer_syscall_pre_compat_43_owait()                              \
+  __sanitizer_syscall_pre_impl_compat_43_owait()
+#define __sanitizer_syscall_post_compat_43_owait(res)                          \
+  __sanitizer_syscall_post_impl_compat_43_owait(res)
+#define __sanitizer_syscall_pre_compat_12_oswapon(name)                        \
+  __sanitizer_syscall_pre_impl_compat_12_oswapon((long long)(name))
+#define __sanitizer_syscall_post_compat_12_oswapon(res, name)                  \
+  __sanitizer_syscall_post_impl_compat_12_oswapon(res, (long long)(name))
+#define __sanitizer_syscall_pre_compat_50_getitimer(which, itv)                \
+  __sanitizer_syscall_pre_impl_compat_50_getitimer((long long)(which),         \
+                                                   (long long)(itv))
+#define __sanitizer_syscall_post_compat_50_getitimer(res, which, itv)          \
+  __sanitizer_syscall_post_impl_compat_50_getitimer(res, (long long)(which),   \
+                                                    (long long)(itv))
+#define __sanitizer_syscall_pre_compat_43_ogethostname(hostname, len)          \
+  __sanitizer_syscall_pre_impl_compat_43_ogethostname((long long)(hostname),   \
+                                                      (long long)(len))
+#define __sanitizer_syscall_post_compat_43_ogethostname(res, hostname, len)    \
+  __sanitizer_syscall_post_impl_compat_43_ogethostname(                        \
+      res, (long long)(hostname), (long long)(len))
+#define __sanitizer_syscall_pre_compat_43_osethostname(hostname, len)          \
+  __sanitizer_syscall_pre_impl_compat_43_osethostname((long long)(hostname),   \
+                                                      (long long)(len))
+#define __sanitizer_syscall_post_compat_43_osethostname(res, hostname, len)    \
+  __sanitizer_syscall_post_impl_compat_43_osethostname(                        \
+      res, (long long)(hostname), (long long)(len))
+#define __sanitizer_syscall_pre_compat_43_ogetdtablesize()                     \
+  __sanitizer_syscall_pre_impl_compat_43_ogetdtablesize()
+#define __sanitizer_syscall_post_compat_43_ogetdtablesize(res)                 \
+  __sanitizer_syscall_post_impl_compat_43_ogetdtablesize(res)
+#define __sanitizer_syscall_pre_dup2(from, to)                                 \
+  __sanitizer_syscall_pre_impl_dup2((long long)(from), (long long)(to))
+#define __sanitizer_syscall_post_dup2(res, from, to)                           \
+  __sanitizer_syscall_post_impl_dup2(res, (long long)(from), (long long)(to))
+/* syscall 91 has been skipped */
+#define __sanitizer_syscall_pre_fcntl(fd, cmd, arg)                            \
+  __sanitizer_syscall_pre_impl_fcntl((long long)(fd), (long long)(cmd),        \
+                                     (long long)(arg))
+#define __sanitizer_syscall_post_fcntl(res, fd, cmd, arg)                      \
+  __sanitizer_syscall_post_impl_fcntl(res, (long long)(fd), (long long)(cmd),  \
+                                      (long long)(arg))
+#define __sanitizer_syscall_pre_compat_50_select(nd, in, ou, ex, tv)           \
+  __sanitizer_syscall_pre_impl_compat_50_select(                               \
+      (long long)(nd), (long long)(in), (long long)(ou), (long long)(ex),      \
+      (long long)(tv))
+#define __sanitizer_syscall_post_compat_50_select(res, nd, in, ou, ex, tv)     \
+  __sanitizer_syscall_post_impl_compat_50_select(                              \
+      res, (long long)(nd), (long long)(in), (long long)(ou), (long long)(ex), \
+      (long long)(tv))
+/* syscall 94 has been skipped */
+#define __sanitizer_syscall_pre_fsync(fd)                                      \
+  __sanitizer_syscall_pre_impl_fsync((long long)(fd))
+#define __sanitizer_syscall_post_fsync(res, fd)                                \
+  __sanitizer_syscall_post_impl_fsync(res, (long long)(fd))
+#define __sanitizer_syscall_pre_setpriority(which, who, prio)                  \
+  __sanitizer_syscall_pre_impl_setpriority(                                    \
+      (long long)(which), (long long)(who), (long long)(prio))
+#define __sanitizer_syscall_post_setpriority(res, which, who, prio)            \
+  __sanitizer_syscall_post_impl_setpriority(                                   \
+      res, (long long)(which), (long long)(who), (long long)(prio))
+#define __sanitizer_syscall_pre_compat_30_socket(domain, type, protocol)       \
+  __sanitizer_syscall_pre_impl_compat_30_socket(                               \
+      (long long)(domain), (long long)(type), (long long)(protocol))
+#define __sanitizer_syscall_post_compat_30_socket(res, domain, type, protocol) \
+  __sanitizer_syscall_post_impl_compat_30_socket(                              \
+      res, (long long)(domain), (long long)(type), (long long)(protocol))
+#define __sanitizer_syscall_pre_connect(s, name, namelen)                      \
+  __sanitizer_syscall_pre_impl_connect((long long)(s), (long long)(name),      \
+                                       (long long)(namelen))
+#define __sanitizer_syscall_post_connect(res, s, name, namelen)                \
+  __sanitizer_syscall_post_impl_connect(                                       \
+      res, (long long)(s), (long long)(name), (long long)(namelen))
+#define __sanitizer_syscall_pre_compat_43_oaccept(s, name, anamelen)           \
+  __sanitizer_syscall_pre_impl_compat_43_oaccept(                              \
+      (long long)(s), (long long)(name), (long long)(anamelen))
+#define __sanitizer_syscall_post_compat_43_oaccept(res, s, name, anamelen)     \
+  __sanitizer_syscall_post_impl_compat_43_oaccept(                             \
+      res, (long long)(s), (long long)(name), (long long)(anamelen))
+#define __sanitizer_syscall_pre_getpriority(which, who)                        \
+  __sanitizer_syscall_pre_impl_getpriority((long long)(which), (long long)(who))
+#define __sanitizer_syscall_post_getpriority(res, which, who)                  \
+  __sanitizer_syscall_post_impl_getpriority(res, (long long)(which),           \
+                                            (long long)(who))
+#define __sanitizer_syscall_pre_compat_43_osend(s, buf, len, flags)            \
+  __sanitizer_syscall_pre_impl_compat_43_osend(                                \
+      (long long)(s), (long long)(buf), (long long)(len), (long long)(flags))
+#define __sanitizer_syscall_post_compat_43_osend(res, s, buf, len, flags)      \
+  __sanitizer_syscall_post_impl_compat_43_osend(                               \
+      res, (long long)(s), (long long)(buf), (long long)(len),                 \
+      (long long)(flags))
+#define __sanitizer_syscall_pre_compat_43_orecv(s, buf, len, flags)            \
+  __sanitizer_syscall_pre_impl_compat_43_orecv(                                \
+      (long long)(s), (long long)(buf), (long long)(len), (long long)(flags))
+#define __sanitizer_syscall_post_compat_43_orecv(res, s, buf, len, flags)      \
+  __sanitizer_syscall_post_impl_compat_43_orecv(                               \
+      res, (long long)(s), (long long)(buf), (long long)(len),                 \
+      (long long)(flags))
+#define __sanitizer_syscall_pre_compat_13_sigreturn13(sigcntxp)                \
+  __sanitizer_syscall_pre_impl_compat_13_sigreturn13((long long)(sigcntxp))
+#define __sanitizer_syscall_post_compat_13_sigreturn13(res, sigcntxp)          \
+  __sanitizer_syscall_post_impl_compat_13_sigreturn13(res,                     \
+                                                      (long long)(sigcntxp))
+#define __sanitizer_syscall_pre_bind(s, name, namelen)                         \
+  __sanitizer_syscall_pre_impl_bind((long long)(s), (long long)(name),         \
+                                    (long long)(namelen))
+#define __sanitizer_syscall_post_bind(res, s, name, namelen)                   \
+  __sanitizer_syscall_post_impl_bind(res, (long long)(s), (long long)(name),   \
+                                     (long long)(namelen))
+#define __sanitizer_syscall_pre_setsockopt(s, level, name, val, valsize)       \
+  __sanitizer_syscall_pre_impl_setsockopt((long long)(s), (long long)(level),  \
+                                          (long long)(name), (long long)(val), \
+                                          (long long)(valsize))
+#define __sanitizer_syscall_post_setsockopt(res, s, level, name, val, valsize) \
+  __sanitizer_syscall_post_impl_setsockopt(                                    \
+      res, (long long)(s), (long long)(level), (long long)(name),              \
+      (long long)(val), (long long)(valsize))
+#define __sanitizer_syscall_pre_listen(s, backlog)                             \
+  __sanitizer_syscall_pre_impl_listen((long long)(s), (long long)(backlog))
+#define __sanitizer_syscall_post_listen(res, s, backlog)                       \
+  __sanitizer_syscall_post_impl_listen(res, (long long)(s),                    \
+                                       (long long)(backlog))
+/* syscall 107 has been skipped */
+#define __sanitizer_syscall_pre_compat_43_osigvec(signum, nsv, osv)            \
+  __sanitizer_syscall_pre_impl_compat_43_osigvec(                              \
+      (long long)(signum), (long long)(nsv), (long long)(osv))
+#define __sanitizer_syscall_post_compat_43_osigvec(res, signum, nsv, osv)      \
+  __sanitizer_syscall_post_impl_compat_43_osigvec(                             \
+      res, (long long)(signum), (long long)(nsv), (long long)(osv))
+#define __sanitizer_syscall_pre_compat_43_osigblock(mask)                      \
+  __sanitizer_syscall_pre_impl_compat_43_osigblock((long long)(mask))
+#define __sanitizer_syscall_post_compat_43_osigblock(res, mask)                \
+  __sanitizer_syscall_post_impl_compat_43_osigblock(res, (long long)(mask))
+#define __sanitizer_syscall_pre_compat_43_osigsetmask(mask)                    \
+  __sanitizer_syscall_pre_impl_compat_43_osigsetmask((long long)(mask))
+#define __sanitizer_syscall_post_compat_43_osigsetmask(res, mask)              \
+  __sanitizer_syscall_post_impl_compat_43_osigsetmask(res, (long long)(mask))
+#define __sanitizer_syscall_pre_compat_13_sigsuspend13(mask)                   \
+  __sanitizer_syscall_pre_impl_compat_13_sigsuspend13((long long)(mask))
+#define __sanitizer_syscall_post_compat_13_sigsuspend13(res, mask)             \
+  __sanitizer_syscall_post_impl_compat_13_sigsuspend13(res, (long long)(mask))
+#define __sanitizer_syscall_pre_compat_43_osigstack(nss, oss)                  \
+  __sanitizer_syscall_pre_impl_compat_43_osigstack((long long)(nss),           \
+                                                   (long long)(oss))
+#define __sanitizer_syscall_post_compat_43_osigstack(res, nss, oss)            \
+  __sanitizer_syscall_post_impl_compat_43_osigstack(res, (long long)(nss),     \
+                                                    (long long)(oss))
+#define __sanitizer_syscall_pre_compat_43_orecvmsg(s, msg, flags)              \
+  __sanitizer_syscall_pre_impl_compat_43_orecvmsg(                             \
+      (long long)(s), (long long)(msg), (long long)(flags))
+#define __sanitizer_syscall_post_compat_43_orecvmsg(res, s, msg, flags)        \
+  __sanitizer_syscall_post_impl_compat_43_orecvmsg(                            \
+      res, (long long)(s), (long long)(msg), (long long)(flags))
+#define __sanitizer_syscall_pre_compat_43_osendmsg(s, msg, flags)              \
+  __sanitizer_syscall_pre_impl_compat_43_osendmsg(                             \
+      (long long)(s), (long long)(msg), (long long)(flags))
+#define __sanitizer_syscall_post_compat_43_osendmsg(res, s, msg, flags)        \
+  __sanitizer_syscall_post_impl_compat_43_osendmsg(                            \
+      res, (long long)(s), (long long)(msg), (long long)(flags))
+/* syscall 115 has been skipped */
+#define __sanitizer_syscall_pre_compat_50_gettimeofday(tp, tzp)                \
+  __sanitizer_syscall_pre_impl_compat_50_gettimeofday((long long)(tp),         \
+                                                      (long long)(tzp))
+#define __sanitizer_syscall_post_compat_50_gettimeofday(res, tp, tzp)          \
+  __sanitizer_syscall_post_impl_compat_50_gettimeofday(res, (long long)(tp),   \
+                                                       (long long)(tzp))
+#define __sanitizer_syscall_pre_compat_50_getrusage(who, rusage)               \
+  __sanitizer_syscall_pre_impl_compat_50_getrusage((long long)(who),           \
+                                                   (long long)(rusage))
+#define __sanitizer_syscall_post_compat_50_getrusage(res, who, rusage)         \
+  __sanitizer_syscall_post_impl_compat_50_getrusage(res, (long long)(who),     \
+                                                    (long long)(rusage))
+#define __sanitizer_syscall_pre_getsockopt(s, level, name, val, avalsize)      \
+  __sanitizer_syscall_pre_impl_getsockopt((long long)(s), (long long)(level),  \
+                                          (long long)(name), (long long)(val), \
+                                          (long long)(avalsize))
+#define __sanitizer_syscall_post_getsockopt(res, s, level, name, val,          \
+                                            avalsize)                          \
+  __sanitizer_syscall_post_impl_getsockopt(                                    \
+      res, (long long)(s), (long long)(level), (long long)(name),              \
+      (long long)(val), (long long)(avalsize))
+/* syscall 119 has been skipped */
+#define __sanitizer_syscall_pre_readv(fd, iovp, iovcnt)                        \
+  __sanitizer_syscall_pre_impl_readv((long long)(fd), (long long)(iovp),       \
+                                     (long long)(iovcnt))
+#define __sanitizer_syscall_post_readv(res, fd, iovp, iovcnt)                  \
+  __sanitizer_syscall_post_impl_readv(res, (long long)(fd), (long long)(iovp), \
+                                      (long long)(iovcnt))
+#define __sanitizer_syscall_pre_writev(fd, iovp, iovcnt)                       \
+  __sanitizer_syscall_pre_impl_writev((long long)(fd), (long long)(iovp),      \
+                                      (long long)(iovcnt))
+#define __sanitizer_syscall_post_writev(res, fd, iovp, iovcnt)                 \
+  __sanitizer_syscall_post_impl_writev(res, (long long)(fd),                   \
+                                       (long long)(iovp), (long long)(iovcnt))
+#define __sanitizer_syscall_pre_compat_50_settimeofday(tv, tzp)                \
+  __sanitizer_syscall_pre_impl_compat_50_settimeofday((long long)(tv),         \
+                                                      (long long)(tzp))
+#define __sanitizer_syscall_post_compat_50_settimeofday(res, tv, tzp)          \
+  __sanitizer_syscall_post_impl_compat_50_settimeofday(res, (long long)(tv),   \
+                                                       (long long)(tzp))
+#define __sanitizer_syscall_pre_fchown(fd, uid, gid)                           \
+  __sanitizer_syscall_pre_impl_fchown((long long)(fd), (long long)(uid),       \
+                                      (long long)(gid))
+#define __sanitizer_syscall_post_fchown(res, fd, uid, gid)                     \
+  __sanitizer_syscall_post_impl_fchown(res, (long long)(fd), (long long)(uid), \
+                                       (long long)(gid))
+#define __sanitizer_syscall_pre_fchmod(fd, mode)                               \
+  __sanitizer_syscall_pre_impl_fchmod((long long)(fd), (long long)(mode))
+#define __sanitizer_syscall_post_fchmod(res, fd, mode)                         \
+  __sanitizer_syscall_post_impl_fchmod(res, (long long)(fd), (long long)(mode))
+#define __sanitizer_syscall_pre_compat_43_orecvfrom(s, buf, len, flags, from,  \
+                                                    fromlenaddr)               \
+  __sanitizer_syscall_pre_impl_compat_43_orecvfrom(                            \
+      (long long)(s), (long long)(buf), (long long)(len), (long long)(flags),  \
+      (long long)(from), (long long)(fromlenaddr))
+#define __sanitizer_syscall_post_compat_43_orecvfrom(res, s, buf, len, flags,  \
+                                                     from, fromlenaddr)        \
+  __sanitizer_syscall_post_impl_compat_43_orecvfrom(                           \
+      res, (long long)(s), (long long)(buf), (long long)(len),                 \
+      (long long)(flags), (long long)(from), (long long)(fromlenaddr))
+#define __sanitizer_syscall_pre_setreuid(ruid, euid)                           \
+  __sanitizer_syscall_pre_impl_setreuid((long long)(ruid), (long long)(euid))
+#define __sanitizer_syscall_post_setreuid(res, ruid, euid)                     \
+  __sanitizer_syscall_post_impl_setreuid(res, (long long)(ruid),               \
+                                         (long long)(euid))
+#define __sanitizer_syscall_pre_setregid(rgid, egid)                           \
+  __sanitizer_syscall_pre_impl_setregid((long long)(rgid), (long long)(egid))
+#define __sanitizer_syscall_post_setregid(res, rgid, egid)                     \
+  __sanitizer_syscall_post_impl_setregid(res, (long long)(rgid),               \
+                                         (long long)(egid))
+#define __sanitizer_syscall_pre_rename(from, to)                               \
+  __sanitizer_syscall_pre_impl_rename((long long)(from), (long long)(to))
+#define __sanitizer_syscall_post_rename(res, from, to)                         \
+  __sanitizer_syscall_post_impl_rename(res, (long long)(from), (long long)(to))
+#define __sanitizer_syscall_pre_compat_43_otruncate(path, length)              \
+  __sanitizer_syscall_pre_impl_compat_43_otruncate((long long)(path),          \
+                                                   (long long)(length))
+#define __sanitizer_syscall_post_compat_43_otruncate(res, path, length)        \
+  __sanitizer_syscall_post_impl_compat_43_otruncate(res, (long long)(path),    \
+                                                    (long long)(length))
+#define __sanitizer_syscall_pre_compat_43_oftruncate(fd, length)               \
+  __sanitizer_syscall_pre_impl_compat_43_oftruncate((long long)(fd),           \
+                                                    (long long)(length))
+#define __sanitizer_syscall_post_compat_43_oftruncate(res, fd, length)         \
+  __sanitizer_syscall_post_impl_compat_43_oftruncate(res, (long long)(fd),     \
+                                                     (long long)(length))
+#define __sanitizer_syscall_pre_flock(fd, how)                                 \
+  __sanitizer_syscall_pre_impl_flock((long long)(fd), (long long)(how))
+#define __sanitizer_syscall_post_flock(res, fd, how)                           \
+  __sanitizer_syscall_post_impl_flock(res, (long long)(fd), (long long)(how))
+#define __sanitizer_syscall_pre_mkfifo(path, mode)                             \
+  __sanitizer_syscall_pre_impl_mkfifo((long long)(path), (long long)(mode))
+#define __sanitizer_syscall_post_mkfifo(res, path, mode)                       \
+  __sanitizer_syscall_post_impl_mkfifo(res, (long long)(path),                 \
+                                       (long long)(mode))
+#define __sanitizer_syscall_pre_sendto(s, buf, len, flags, to, tolen)          \
+  __sanitizer_syscall_pre_impl_sendto((long long)(s), (long long)(buf),        \
+                                      (long long)(len), (long long)(flags),    \
+                                      (long long)(to), (long long)(tolen))
+#define __sanitizer_syscall_post_sendto(res, s, buf, len, flags, to, tolen)    \
+  __sanitizer_syscall_post_impl_sendto(res, (long long)(s), (long long)(buf),  \
+                                       (long long)(len), (long long)(flags),   \
+                                       (long long)(to), (long long)(tolen))
+#define __sanitizer_syscall_pre_shutdown(s, how)                               \
+  __sanitizer_syscall_pre_impl_shutdown((long long)(s), (long long)(how))
+#define __sanitizer_syscall_post_shutdown(res, s, how)                         \
+  __sanitizer_syscall_post_impl_shutdown(res, (long long)(s), (long long)(how))
+#define __sanitizer_syscall_pre_socketpair(domain, type, protocol, rsv)        \
+  __sanitizer_syscall_pre_impl_socketpair(                                     \
+      (long long)(domain), (long long)(type), (long long)(protocol),           \
+      (long long)(rsv))
+#define __sanitizer_syscall_post_socketpair(res, domain, type, protocol, rsv)  \
+  __sanitizer_syscall_post_impl_socketpair(                                    \
+      res, (long long)(domain), (long long)(type), (long long)(protocol),      \
+      (long long)(rsv))
+#define __sanitizer_syscall_pre_mkdir(path, mode)                              \
+  __sanitizer_syscall_pre_impl_mkdir((long long)(path), (long long)(mode))
+#define __sanitizer_syscall_post_mkdir(res, path, mode)                        \
+  __sanitizer_syscall_post_impl_mkdir(res, (long long)(path), (long long)(mode))
+#define __sanitizer_syscall_pre_rmdir(path)                                    \
+  __sanitizer_syscall_pre_impl_rmdir((long long)(path))
+#define __sanitizer_syscall_post_rmdir(res, path)                              \
+  __sanitizer_syscall_post_impl_rmdir(res, (long long)(path))
+#define __sanitizer_syscall_pre_compat_50_utimes(path, tptr)                   \
+  __sanitizer_syscall_pre_impl_compat_50_utimes((long long)(path),             \
+                                                (long long)(tptr))
+#define __sanitizer_syscall_post_compat_50_utimes(res, path, tptr)             \
+  __sanitizer_syscall_post_impl_compat_50_utimes(res, (long long)(path),       \
+                                                 (long long)(tptr))
+/* syscall 139 has been skipped */
+#define __sanitizer_syscall_pre_compat_50_adjtime(delta, olddelta)             \
+  __sanitizer_syscall_pre_impl_compat_50_adjtime((long long)(delta),           \
+                                                 (long long)(olddelta))
+#define __sanitizer_syscall_post_compat_50_adjtime(res, delta, olddelta)       \
+  __sanitizer_syscall_post_impl_compat_50_adjtime(res, (long long)(delta),     \
+                                                  (long long)(olddelta))
+#define __sanitizer_syscall_pre_compat_43_ogetpeername(fdes, asa, alen)        \
+  __sanitizer_syscall_pre_impl_compat_43_ogetpeername(                         \
+      (long long)(fdes), (long long)(asa), (long long)(alen))
+#define __sanitizer_syscall_post_compat_43_ogetpeername(res, fdes, asa, alen)  \
+  __sanitizer_syscall_post_impl_compat_43_ogetpeername(                        \
+      res, (long long)(fdes), (long long)(asa), (long long)(alen))
+#define __sanitizer_syscall_pre_compat_43_ogethostid()                         \
+  __sanitizer_syscall_pre_impl_compat_43_ogethostid()
+#define __sanitizer_syscall_post_compat_43_ogethostid(res)                     \
+  __sanitizer_syscall_post_impl_compat_43_ogethostid(res)
+#define __sanitizer_syscall_pre_compat_43_osethostid(hostid)                   \
+  __sanitizer_syscall_pre_impl_compat_43_osethostid((long long)(hostid))
+#define __sanitizer_syscall_post_compat_43_osethostid(res, hostid)             \
+  __sanitizer_syscall_post_impl_compat_43_osethostid(res, (long long)(hostid))
+#define __sanitizer_syscall_pre_compat_43_ogetrlimit(which, rlp)               \
+  __sanitizer_syscall_pre_impl_compat_43_ogetrlimit((long long)(which),        \
+                                                    (long long)(rlp))
+#define __sanitizer_syscall_post_compat_43_ogetrlimit(res, which, rlp)         \
+  __sanitizer_syscall_post_impl_compat_43_ogetrlimit(res, (long long)(which),  \
+                                                     (long long)(rlp))
+#define __sanitizer_syscall_pre_compat_43_osetrlimit(which, rlp)               \
+  __sanitizer_syscall_pre_impl_compat_43_osetrlimit((long long)(which),        \
+                                                    (long long)(rlp))
+#define __sanitizer_syscall_post_compat_43_osetrlimit(res, which, rlp)         \
+  __sanitizer_syscall_post_impl_compat_43_osetrlimit(res, (long long)(which),  \
+                                                     (long long)(rlp))
+#define __sanitizer_syscall_pre_compat_43_okillpg(pgid, signum)                \
+  __sanitizer_syscall_pre_impl_compat_43_okillpg((long long)(pgid),            \
+                                                 (long long)(signum))
+#define __sanitizer_syscall_post_compat_43_okillpg(res, pgid, signum)          \
+  __sanitizer_syscall_post_impl_compat_43_okillpg(res, (long long)(pgid),      \
+                                                  (long long)(signum))
+#define __sanitizer_syscall_pre_setsid() __sanitizer_syscall_pre_impl_setsid()
+#define __sanitizer_syscall_post_setsid(res)                                   \
+  __sanitizer_syscall_post_impl_setsid(res)
+#define __sanitizer_syscall_pre_compat_50_quotactl(path, cmd, uid, arg)        \
+  __sanitizer_syscall_pre_impl_compat_50_quotactl(                             \
+      (long long)(path), (long long)(cmd), (long long)(uid), (long long)(arg))
+#define __sanitizer_syscall_post_compat_50_quotactl(res, path, cmd, uid, arg)  \
+  __sanitizer_syscall_post_impl_compat_50_quotactl(                            \
+      res, (long long)(path), (long long)(cmd), (long long)(uid),              \
+      (long long)(arg))
+#define __sanitizer_syscall_pre_compat_43_oquota()                             \
+  __sanitizer_syscall_pre_impl_compat_43_oquota()
+#define __sanitizer_syscall_post_compat_43_oquota(res)                         \
+  __sanitizer_syscall_post_impl_compat_43_oquota(res)
+#define __sanitizer_syscall_pre_compat_43_ogetsockname(fdec, asa, alen)        \
+  __sanitizer_syscall_pre_impl_compat_43_ogetsockname(                         \
+      (long long)(fdec), (long long)(asa), (long long)(alen))
+#define __sanitizer_syscall_post_compat_43_ogetsockname(res, fdec, asa, alen)  \
+  __sanitizer_syscall_post_impl_compat_43_ogetsockname(                        \
+      res, (long long)(fdec), (long long)(asa), (long long)(alen))
+/* syscall 151 has been skipped */
+/* syscall 152 has been skipped */
+/* syscall 153 has been skipped */
+/* syscall 154 has been skipped */
+#define __sanitizer_syscall_pre_nfssvc(flag, argp)                             \
+  __sanitizer_syscall_pre_impl_nfssvc((long long)(flag), (long long)(argp))
+#define __sanitizer_syscall_post_nfssvc(res, flag, argp)                       \
+  __sanitizer_syscall_post_impl_nfssvc(res, (long long)(flag),                 \
+                                       (long long)(argp))
+#define __sanitizer_syscall_pre_compat_43_ogetdirentries(fd, buf, count,       \
+                                                         basep)                \
+  __sanitizer_syscall_pre_impl_compat_43_ogetdirentries(                       \
+      (long long)(fd), (long long)(buf), (long long)(count),                   \
+      (long long)(basep))
+#define __sanitizer_syscall_post_compat_43_ogetdirentries(res, fd, buf, count, \
+                                                          basep)               \
+  __sanitizer_syscall_post_impl_compat_43_ogetdirentries(                      \
+      res, (long long)(fd), (long long)(buf), (long long)(count),              \
+      (long long)(basep))
+#define __sanitizer_syscall_pre_compat_20_statfs(path, buf)                    \
+  __sanitizer_syscall_pre_impl_compat_20_statfs((long long)(path),             \
+                                                (long long)(buf))
+#define __sanitizer_syscall_post_compat_20_statfs(res, path, buf)              \
+  __sanitizer_syscall_post_impl_compat_20_statfs(res, (long long)(path),       \
+                                                 (long long)(buf))
+#define __sanitizer_syscall_pre_compat_20_fstatfs(fd, buf)                     \
+  __sanitizer_syscall_pre_impl_compat_20_fstatfs((long long)(fd),              \
+                                                 (long long)(buf))
+#define __sanitizer_syscall_post_compat_20_fstatfs(res, fd, buf)               \
+  __sanitizer_syscall_post_impl_compat_20_fstatfs(res, (long long)(fd),        \
+                                                  (long long)(buf))
+/* syscall 159 has been skipped */
+/* syscall 160 has been skipped */
+#define __sanitizer_syscall_pre_compat_30_getfh(fname, fhp)                    \
+  __sanitizer_syscall_pre_impl_compat_30_getfh((long long)(fname),             \
+                                               (long long)(fhp))
+#define __sanitizer_syscall_post_compat_30_getfh(res, fname, fhp)              \
+  __sanitizer_syscall_post_impl_compat_30_getfh(res, (long long)(fname),       \
+                                                (long long)(fhp))
+#define __sanitizer_syscall_pre_compat_09_ogetdomainname(domainname, len)      \
+  __sanitizer_syscall_pre_impl_compat_09_ogetdomainname(                       \
+      (long long)(domainname), (long long)(len))
+#define __sanitizer_syscall_post_compat_09_ogetdomainname(res, domainname,     \
+                                                          len)                 \
+  __sanitizer_syscall_post_impl_compat_09_ogetdomainname(                      \
+      res, (long long)(domainname), (long long)(len))
+#define __sanitizer_syscall_pre_compat_09_osetdomainname(domainname, len)      \
+  __sanitizer_syscall_pre_impl_compat_09_osetdomainname(                       \
+      (long long)(domainname), (long long)(len))
+#define __sanitizer_syscall_post_compat_09_osetdomainname(res, domainname,     \
+                                                          len)                 \
+  __sanitizer_syscall_post_impl_compat_09_osetdomainname(                      \
+      res, (long long)(domainname), (long long)(len))
+#define __sanitizer_syscall_pre_compat_09_ouname(name)                         \
+  __sanitizer_syscall_pre_impl_compat_09_ouname((long long)(name))
+#define __sanitizer_syscall_post_compat_09_ouname(res, name)                   \
+  __sanitizer_syscall_post_impl_compat_09_ouname(res, (long long)(name))
+#define __sanitizer_syscall_pre_sysarch(op, parms)                             \
+  __sanitizer_syscall_pre_impl_sysarch((long long)(op), (long long)(parms))
+#define __sanitizer_syscall_post_sysarch(res, op, parms)                       \
+  __sanitizer_syscall_post_impl_sysarch(res, (long long)(op),                  \
+                                        (long long)(parms))
+/* syscall 166 has been skipped */
+/* syscall 167 has been skipped */
+/* syscall 168 has been skipped */
+#if !defined(_LP64)
+#define __sanitizer_syscall_pre_compat_10_osemsys(which, a2, a3, a4, a5)       \
+  __sanitizer_syscall_pre_impl_compat_10_osemsys(                              \
+      (long long)(which), (long long)(a2), (long long)(a3), (long long)(a4),   \
+      (long long)(a5))
+#define __sanitizer_syscall_post_compat_10_osemsys(res, which, a2, a3, a4, a5) \
+  __sanitizer_syscall_post_impl_compat_10_osemsys(                             \
+      res, (long long)(which), (long long)(a2), (long long)(a3),               \
+      (long long)(a4), (long long)(a5))
+#else
+/* syscall 169 has been skipped */
+#endif
+#if !defined(_LP64)
+#define __sanitizer_syscall_pre_compat_10_omsgsys(which, a2, a3, a4, a5, a6)   \
+  __sanitizer_syscall_pre_impl_compat_10_omsgsys(                              \
+      (long long)(which), (long long)(a2), (long long)(a3), (long long)(a4),   \
+      (long long)(a5), (long long)(a6))
+#define __sanitizer_syscall_post_compat_10_omsgsys(res, which, a2, a3, a4, a5, \
+                                                   a6)                         \
+  __sanitizer_syscall_post_impl_compat_10_omsgsys(                             \
+      res, (long long)(which), (long long)(a2), (long long)(a3),               \
+      (long long)(a4), (long long)(a5), (long long)(a6))
+#else
+/* syscall 170 has been skipped */
+#endif
+#if !defined(_LP64)
+#define __sanitizer_syscall_pre_compat_10_oshmsys(which, a2, a3, a4)           \
+  __sanitizer_syscall_pre_impl_compat_10_oshmsys(                              \
+      (long long)(which), (long long)(a2), (long long)(a3), (long long)(a4))
+#define __sanitizer_syscall_post_compat_10_oshmsys(res, which, a2, a3, a4)     \
+  __sanitizer_syscall_post_impl_compat_10_oshmsys(                             \
+      res, (long long)(which), (long long)(a2), (long long)(a3),               \
+      (long long)(a4))
+#else
+/* syscall 171 has been skipped */
+#endif
+/* syscall 172 has been skipped */
+#define __sanitizer_syscall_pre_pread(fd, buf, nbyte, PAD, offset)             \
+  __sanitizer_syscall_pre_impl_pread((long long)(fd), (long long)(buf),        \
+                                     (long long)(nbyte), (long long)(PAD),     \
+                                     (long long)(offset))
+#define __sanitizer_syscall_post_pread(res, fd, buf, nbyte, PAD, offset)       \
+  __sanitizer_syscall_post_impl_pread(res, (long long)(fd), (long long)(buf),  \
+                                      (long long)(nbyte), (long long)(PAD),    \
+                                      (long long)(offset))
+#define __sanitizer_syscall_pre_pwrite(fd, buf, nbyte, PAD, offset)            \
+  __sanitizer_syscall_pre_impl_pwrite((long long)(fd), (long long)(buf),       \
+                                      (long long)(nbyte), (long long)(PAD),    \
+                                      (long long)(offset))
+#define __sanitizer_syscall_post_pwrite(res, fd, buf, nbyte, PAD, offset)      \
+  __sanitizer_syscall_post_impl_pwrite(res, (long long)(fd), (long long)(buf), \
+                                       (long long)(nbyte), (long long)(PAD),   \
+                                       (long long)(offset))
+#define __sanitizer_syscall_pre_compat_30_ntp_gettime(ntvp)                    \
+  __sanitizer_syscall_pre_impl_compat_30_ntp_gettime((long long)(ntvp))
+#define __sanitizer_syscall_post_compat_30_ntp_gettime(res, ntvp)              \
+  __sanitizer_syscall_post_impl_compat_30_ntp_gettime(res, (long long)(ntvp))
+#if defined(NTP) || !defined(_KERNEL_OPT)
+#define __sanitizer_syscall_pre_ntp_adjtime(tp)                                \
+  __sanitizer_syscall_pre_impl_ntp_adjtime((long long)(tp))
+#define __sanitizer_syscall_post_ntp_adjtime(res, tp)                          \
+  __sanitizer_syscall_post_impl_ntp_adjtime(res, (long long)(tp))
+#else
+/* syscall 176 has been skipped */
+#endif
+/* syscall 177 has been skipped */
+/* syscall 178 has been skipped */
+/* syscall 179 has been skipped */
+/* syscall 180 has been skipped */
+#define __sanitizer_syscall_pre_setgid(gid)                                    \
+  __sanitizer_syscall_pre_impl_setgid((long long)(gid))
+#define __sanitizer_syscall_post_setgid(res, gid)                              \
+  __sanitizer_syscall_post_impl_setgid(res, (long long)(gid))
+#define __sanitizer_syscall_pre_setegid(egid)                                  \
+  __sanitizer_syscall_pre_impl_setegid((long long)(egid))
+#define __sanitizer_syscall_post_setegid(res, egid)                            \
+  __sanitizer_syscall_post_impl_setegid(res, (long long)(egid))
+#define __sanitizer_syscall_pre_seteuid(euid)                                  \
+  __sanitizer_syscall_pre_impl_seteuid((long long)(euid))
+#define __sanitizer_syscall_post_seteuid(res, euid)                            \
+  __sanitizer_syscall_post_impl_seteuid(res, (long long)(euid))
+#define __sanitizer_syscall_pre_lfs_bmapv(fsidp, blkiov, blkcnt)               \
+  __sanitizer_syscall_pre_impl_lfs_bmapv(                                      \
+      (long long)(fsidp), (long long)(blkiov), (long long)(blkcnt))
+#define __sanitizer_syscall_post_lfs_bmapv(res, fsidp, blkiov, blkcnt)         \
+  __sanitizer_syscall_post_impl_lfs_bmapv(                                     \
+      res, (long long)(fsidp), (long long)(blkiov), (long long)(blkcnt))
+#define __sanitizer_syscall_pre_lfs_markv(fsidp, blkiov, blkcnt)               \
+  __sanitizer_syscall_pre_impl_lfs_markv(                                      \
+      (long long)(fsidp), (long long)(blkiov), (long long)(blkcnt))
+#define __sanitizer_syscall_post_lfs_markv(res, fsidp, blkiov, blkcnt)         \
+  __sanitizer_syscall_post_impl_lfs_markv(                                     \
+      res, (long long)(fsidp), (long long)(blkiov), (long long)(blkcnt))
+#define __sanitizer_syscall_pre_lfs_segclean(fsidp, segment)                   \
+  __sanitizer_syscall_pre_impl_lfs_segclean((long long)(fsidp),                \
+                                            (long long)(segment))
+#define __sanitizer_syscall_post_lfs_segclean(res, fsidp, segment)             \
+  __sanitizer_syscall_post_impl_lfs_segclean(res, (long long)(fsidp),          \
+                                             (long long)(segment))
+#define __sanitizer_syscall_pre_compat_50_lfs_segwait(fsidp, tv)               \
+  __sanitizer_syscall_pre_impl_compat_50_lfs_segwait((long long)(fsidp),       \
+                                                     (long long)(tv))
+#define __sanitizer_syscall_post_compat_50_lfs_segwait(res, fsidp, tv)         \
+  __sanitizer_syscall_post_impl_compat_50_lfs_segwait(res, (long long)(fsidp), \
+                                                      (long long)(tv))
+#define __sanitizer_syscall_pre_compat_12_stat12(path, ub)                     \
+  __sanitizer_syscall_pre_impl_compat_12_stat12((long long)(path),             \
+                                                (long long)(ub))
+#define __sanitizer_syscall_post_compat_12_stat12(res, path, ub)               \
+  __sanitizer_syscall_post_impl_compat_12_stat12(res, (long long)(path),       \
+                                                 (long long)(ub))
+#define __sanitizer_syscall_pre_compat_12_fstat12(fd, sb)                      \
+  __sanitizer_syscall_pre_impl_compat_12_fstat12((long long)(fd),              \
+                                                 (long long)(sb))
+#define __sanitizer_syscall_post_compat_12_fstat12(res, fd, sb)                \
+  __sanitizer_syscall_post_impl_compat_12_fstat12(res, (long long)(fd),        \
+                                                  (long long)(sb))
+#define __sanitizer_syscall_pre_compat_12_lstat12(path, ub)                    \
+  __sanitizer_syscall_pre_impl_compat_12_lstat12((long long)(path),            \
+                                                 (long long)(ub))
+#define __sanitizer_syscall_post_compat_12_lstat12(res, path, ub)              \
+  __sanitizer_syscall_post_impl_compat_12_lstat12(res, (long long)(path),      \
+                                                  (long long)(ub))
+#define __sanitizer_syscall_pre_pathconf(path, name)                           \
+  __sanitizer_syscall_pre_impl_pathconf((long long)(path), (long long)(name))
+#define __sanitizer_syscall_post_pathconf(res, path, name)                     \
+  __sanitizer_syscall_post_impl_pathconf(res, (long long)(path),               \
+                                         (long long)(name))
+#define __sanitizer_syscall_pre_fpathconf(fd, name)                            \
+  __sanitizer_syscall_pre_impl_fpathconf((long long)(fd), (long long)(name))
+#define __sanitizer_syscall_post_fpathconf(res, fd, name)                      \
+  __sanitizer_syscall_post_impl_fpathconf(res, (long long)(fd),                \
+                                          (long long)(name))
+/* syscall 193 has been skipped */
+#define __sanitizer_syscall_pre_getrlimit(which, rlp)                          \
+  __sanitizer_syscall_pre_impl_getrlimit((long long)(which), (long long)(rlp))
+#define __sanitizer_syscall_post_getrlimit(res, which, rlp)                    \
+  __sanitizer_syscall_post_impl_getrlimit(res, (long long)(which),             \
+                                          (long long)(rlp))
+#define __sanitizer_syscall_pre_setrlimit(which, rlp)                          \
+  __sanitizer_syscall_pre_impl_setrlimit((long long)(which), (long long)(rlp))
+#define __sanitizer_syscall_post_setrlimit(res, which, rlp)                    \
+  __sanitizer_syscall_post_impl_setrlimit(res, (long long)(which),             \
+                                          (long long)(rlp))
+#define __sanitizer_syscall_pre_compat_12_getdirentries(fd, buf, count, basep) \
+  __sanitizer_syscall_pre_impl_compat_12_getdirentries(                        \
+      (long long)(fd), (long long)(buf), (long long)(count),                   \
+      (long long)(basep))
+#define __sanitizer_syscall_post_compat_12_getdirentries(res, fd, buf, count,  \
+                                                         basep)                \
+  __sanitizer_syscall_post_impl_compat_12_getdirentries(                       \
+      res, (long long)(fd), (long long)(buf), (long long)(count),              \
+      (long long)(basep))
+#define __sanitizer_syscall_pre_mmap(addr, len, prot, flags, fd, PAD, pos)     \
+  __sanitizer_syscall_pre_impl_mmap(                                           \
+      (long long)(addr), (long long)(len), (long long)(prot),                  \
+      (long long)(flags), (long long)(fd), (long long)(PAD), (long long)(pos))
+#define __sanitizer_syscall_post_mmap(res, addr, len, prot, flags, fd, PAD,    \
+                                      pos)                                     \
+  __sanitizer_syscall_post_impl_mmap(                                          \
+      res, (long long)(addr), (long long)(len), (long long)(prot),             \
+      (long long)(flags), (long long)(fd), (long long)(PAD), (long long)(pos))
+#define __sanitizer_syscall_pre___syscall(code, arg0, arg1, arg2, arg3, arg4,  \
+                                          arg5, arg6, arg7)                    \
+  __sanitizer_syscall_pre_impl___syscall(                                      \
+      (long long)(code), (long long)(arg0), (long long)(arg1),                 \
+      (long long)(arg2), (long long)(arg3), (long long)(arg4),                 \
+      (long long)(arg5), (long long)(arg6), (long long)(arg7))
+#define __sanitizer_syscall_post___syscall(res, code, arg0, arg1, arg2, arg3,  \
+                                           arg4, arg5, arg6, arg7)             \
+  __sanitizer_syscall_post_impl___syscall(                                     \
+      res, (long long)(code), (long long)(arg0), (long long)(arg1),            \
+      (long long)(arg2), (long long)(arg3), (long long)(arg4),                 \
+      (long long)(arg5), (long long)(arg6), (long long)(arg7))
+#define __sanitizer_syscall_pre_lseek(fd, PAD, offset, whence)                 \
+  __sanitizer_syscall_pre_impl_lseek((long long)(fd), (long long)(PAD),        \
+                                     (long long)(offset), (long long)(whence))
+#define __sanitizer_syscall_post_lseek(res, fd, PAD, offset, whence)           \
+  __sanitizer_syscall_post_impl_lseek(res, (long long)(fd), (long long)(PAD),  \
+                                      (long long)(offset),                     \
+                                      (long long)(whence))
+#define __sanitizer_syscall_pre_truncate(path, PAD, length)                    \
+  __sanitizer_syscall_pre_impl_truncate((long long)(path), (long long)(PAD),   \
+                                        (long long)(length))
+#define __sanitizer_syscall_post_truncate(res, path, PAD, length)              \
+  __sanitizer_syscall_post_impl_truncate(                                      \
+      res, (long long)(path), (long long)(PAD), (long long)(length))
+#define __sanitizer_syscall_pre_ftruncate(fd, PAD, length)                     \
+  __sanitizer_syscall_pre_impl_ftruncate((long long)(fd), (long long)(PAD),    \
+                                         (long long)(length))
+#define __sanitizer_syscall_post_ftruncate(res, fd, PAD, length)               \
+  __sanitizer_syscall_post_impl_ftruncate(                                     \
+      res, (long long)(fd), (long long)(PAD), (long long)(length))
+#define __sanitizer_syscall_pre___sysctl(name, namelen, oldv, oldlenp, newv,   \
+                                         newlen)                               \
+  __sanitizer_syscall_pre_impl___sysctl(                                       \
+      (long long)(name), (long long)(namelen), (long long)(oldv),              \
+      (long long)(oldlenp), (long long)(newv), (long long)(newlen))
+#define __sanitizer_syscall_post___sysctl(res, name, namelen, oldv, oldlenp,   \
+                                          newv, newlen)                        \
+  __sanitizer_syscall_post_impl___sysctl(                                      \
+      res, (long long)(name), (long long)(namelen), (long long)(oldv),         \
+      (long long)(oldlenp), (long long)(newv), (long long)(newlen))
+#define __sanitizer_syscall_pre_mlock(addr, len)                               \
+  __sanitizer_syscall_pre_impl_mlock((long long)(addr), (long long)(len))
+#define __sanitizer_syscall_post_mlock(res, addr, len)                         \
+  __sanitizer_syscall_post_impl_mlock(res, (long long)(addr), (long long)(len))
+#define __sanitizer_syscall_pre_munlock(addr, len)                             \
+  __sanitizer_syscall_pre_impl_munlock((long long)(addr), (long long)(len))
+#define __sanitizer_syscall_post_munlock(res, addr, len)                       \
+  __sanitizer_syscall_post_impl_munlock(res, (long long)(addr),                \
+                                        (long long)(len))
+#define __sanitizer_syscall_pre_undelete(path)                                 \
+  __sanitizer_syscall_pre_impl_undelete((long long)(path))
+#define __sanitizer_syscall_post_undelete(res, path)                           \
+  __sanitizer_syscall_post_impl_undelete(res, (long long)(path))
+#define __sanitizer_syscall_pre_compat_50_futimes(fd, tptr)                    \
+  __sanitizer_syscall_pre_impl_compat_50_futimes((long long)(fd),              \
+                                                 (long long)(tptr))
+#define __sanitizer_syscall_post_compat_50_futimes(res, fd, tptr)              \
+  __sanitizer_syscall_post_impl_compat_50_futimes(res, (long long)(fd),        \
+                                                  (long long)(tptr))
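+/* Hook names prefixed with compat_NN (such as compat_50_futimes above)
+   presumably cover the compatibility syscalls NetBSD retains for
+   binaries built against older releases; compat_50, for instance,
+   predates the NetBSD 6.0 widening of time_t. */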
+#define __sanitizer_syscall_pre_getpgid(pid)                                   \
+  __sanitizer_syscall_pre_impl_getpgid((long long)(pid))
+#define __sanitizer_syscall_post_getpgid(res, pid)                             \
+  __sanitizer_syscall_post_impl_getpgid(res, (long long)(pid))
+#define __sanitizer_syscall_pre_reboot(opt, bootstr)                           \
+  __sanitizer_syscall_pre_impl_reboot((long long)(opt), (long long)(bootstr))
+#define __sanitizer_syscall_post_reboot(res, opt, bootstr)                     \
+  __sanitizer_syscall_post_impl_reboot(res, (long long)(opt),                  \
+                                       (long long)(bootstr))
+#define __sanitizer_syscall_pre_poll(fds, nfds, timeout)                       \
+  __sanitizer_syscall_pre_impl_poll((long long)(fds), (long long)(nfds),       \
+                                    (long long)(timeout))
+#define __sanitizer_syscall_post_poll(res, fds, nfds, timeout)                 \
+  __sanitizer_syscall_post_impl_poll(res, (long long)(fds), (long long)(nfds), \
+                                     (long long)(timeout))
+#define __sanitizer_syscall_pre_afssys(id, a1, a2, a3, a4, a5, a6)             \
+  __sanitizer_syscall_pre_impl_afssys(                                         \
+      (long long)(id), (long long)(a1), (long long)(a2), (long long)(a3),      \
+      (long long)(a4), (long long)(a5), (long long)(a6))
+#define __sanitizer_syscall_post_afssys(res, id, a1, a2, a3, a4, a5, a6)       \
+  __sanitizer_syscall_post_impl_afssys(                                        \
+      res, (long long)(id), (long long)(a1), (long long)(a2), (long long)(a3), \
+      (long long)(a4), (long long)(a5), (long long)(a6))
+/* syscall 211 has been skipped */
+/* syscall 212 has been skipped */
+/* syscall 213 has been skipped */
+/* syscall 214 has been skipped */
+/* syscall 215 has been skipped */
+/* syscall 216 has been skipped */
+/* syscall 217 has been skipped */
+/* syscall 218 has been skipped */
+/* syscall 219 has been skipped */
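+/* Skipped slots such as the range above presumably mark syscall numbers
+   that are unused or obsolete in the NetBSD syscall table, so no hook
+   macros are provided for them. */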
+#define __sanitizer_syscall_pre_compat_14___semctl(semid, semnum, cmd, arg)    \
+  __sanitizer_syscall_pre_impl_compat_14___semctl(                             \
+      (long long)(semid), (long long)(semnum), (long long)(cmd),               \
+      (long long)(arg))
+#define __sanitizer_syscall_post_compat_14___semctl(res, semid, semnum, cmd,   \
+                                                    arg)                       \
+  __sanitizer_syscall_post_impl_compat_14___semctl(                            \
+      res, (long long)(semid), (long long)(semnum), (long long)(cmd),          \
+      (long long)(arg))
+#define __sanitizer_syscall_pre_semget(key, nsems, semflg)                     \
+  __sanitizer_syscall_pre_impl_semget((long long)(key), (long long)(nsems),    \
+                                      (long long)(semflg))
+#define __sanitizer_syscall_post_semget(res, key, nsems, semflg)               \
+  __sanitizer_syscall_post_impl_semget(                                        \
+      res, (long long)(key), (long long)(nsems), (long long)(semflg))
+#define __sanitizer_syscall_pre_semop(semid, sops, nsops)                      \
+  __sanitizer_syscall_pre_impl_semop((long long)(semid), (long long)(sops),    \
+                                     (long long)(nsops))
+#define __sanitizer_syscall_post_semop(res, semid, sops, nsops)                \
+  __sanitizer_syscall_post_impl_semop(res, (long long)(semid),                 \
+                                      (long long)(sops), (long long)(nsops))
+#define __sanitizer_syscall_pre_semconfig(flag)                                \
+  __sanitizer_syscall_pre_impl_semconfig((long long)(flag))
+#define __sanitizer_syscall_post_semconfig(res, flag)                          \
+  __sanitizer_syscall_post_impl_semconfig(res, (long long)(flag))
+#define __sanitizer_syscall_pre_compat_14_msgctl(msqid, cmd, buf)              \
+  __sanitizer_syscall_pre_impl_compat_14_msgctl(                               \
+      (long long)(msqid), (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_post_compat_14_msgctl(res, msqid, cmd, buf)        \
+  __sanitizer_syscall_post_impl_compat_14_msgctl(                              \
+      res, (long long)(msqid), (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_pre_msgget(key, msgflg)                            \
+  __sanitizer_syscall_pre_impl_msgget((long long)(key), (long long)(msgflg))
+#define __sanitizer_syscall_post_msgget(res, key, msgflg)                      \
+  __sanitizer_syscall_post_impl_msgget(res, (long long)(key),                  \
+                                       (long long)(msgflg))
+#define __sanitizer_syscall_pre_msgsnd(msqid, msgp, msgsz, msgflg)             \
+  __sanitizer_syscall_pre_impl_msgsnd((long long)(msqid), (long long)(msgp),   \
+                                      (long long)(msgsz), (long long)(msgflg))
+#define __sanitizer_syscall_post_msgsnd(res, msqid, msgp, msgsz, msgflg)       \
+  __sanitizer_syscall_post_impl_msgsnd(res, (long long)(msqid),                \
+                                       (long long)(msgp), (long long)(msgsz),  \
+                                       (long long)(msgflg))
+#define __sanitizer_syscall_pre_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg)     \
+  __sanitizer_syscall_pre_impl_msgrcv((long long)(msqid), (long long)(msgp),   \
+                                      (long long)(msgsz), (long long)(msgtyp), \
+                                      (long long)(msgflg))
+#define __sanitizer_syscall_post_msgrcv(res, msqid, msgp, msgsz, msgtyp,       \
+                                        msgflg)                                \
+  __sanitizer_syscall_post_impl_msgrcv(                                        \
+      res, (long long)(msqid), (long long)(msgp), (long long)(msgsz),          \
+      (long long)(msgtyp), (long long)(msgflg))
+#define __sanitizer_syscall_pre_shmat(shmid, shmaddr, shmflg)                  \
+  __sanitizer_syscall_pre_impl_shmat((long long)(shmid), (long long)(shmaddr), \
+                                     (long long)(shmflg))
+#define __sanitizer_syscall_post_shmat(res, shmid, shmaddr, shmflg)            \
+  __sanitizer_syscall_post_impl_shmat(                                         \
+      res, (long long)(shmid), (long long)(shmaddr), (long long)(shmflg))
+#define __sanitizer_syscall_pre_compat_14_shmctl(shmid, cmd, buf)              \
+  __sanitizer_syscall_pre_impl_compat_14_shmctl(                               \
+      (long long)(shmid), (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_post_compat_14_shmctl(res, shmid, cmd, buf)        \
+  __sanitizer_syscall_post_impl_compat_14_shmctl(                              \
+      res, (long long)(shmid), (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_pre_shmdt(shmaddr)                                 \
+  __sanitizer_syscall_pre_impl_shmdt((long long)(shmaddr))
+#define __sanitizer_syscall_post_shmdt(res, shmaddr)                           \
+  __sanitizer_syscall_post_impl_shmdt(res, (long long)(shmaddr))
+#define __sanitizer_syscall_pre_shmget(key, size, shmflg)                      \
+  __sanitizer_syscall_pre_impl_shmget((long long)(key), (long long)(size),     \
+                                      (long long)(shmflg))
+#define __sanitizer_syscall_post_shmget(res, key, size, shmflg)                \
+  __sanitizer_syscall_post_impl_shmget(res, (long long)(key),                  \
+                                       (long long)(size), (long long)(shmflg))
+#define __sanitizer_syscall_pre_compat_50_clock_gettime(clock_id, tp)          \
+  __sanitizer_syscall_pre_impl_compat_50_clock_gettime((long long)(clock_id),  \
+                                                       (long long)(tp))
+#define __sanitizer_syscall_post_compat_50_clock_gettime(res, clock_id, tp)    \
+  __sanitizer_syscall_post_impl_compat_50_clock_gettime(                       \
+      res, (long long)(clock_id), (long long)(tp))
+#define __sanitizer_syscall_pre_compat_50_clock_settime(clock_id, tp)          \
+  __sanitizer_syscall_pre_impl_compat_50_clock_settime((long long)(clock_id),  \
+                                                       (long long)(tp))
+#define __sanitizer_syscall_post_compat_50_clock_settime(res, clock_id, tp)    \
+  __sanitizer_syscall_post_impl_compat_50_clock_settime(                       \
+      res, (long long)(clock_id), (long long)(tp))
+#define __sanitizer_syscall_pre_compat_50_clock_getres(clock_id, tp)           \
+  __sanitizer_syscall_pre_impl_compat_50_clock_getres((long long)(clock_id),   \
+                                                      (long long)(tp))
+#define __sanitizer_syscall_post_compat_50_clock_getres(res, clock_id, tp)     \
+  __sanitizer_syscall_post_impl_compat_50_clock_getres(                        \
+      res, (long long)(clock_id), (long long)(tp))
+#define __sanitizer_syscall_pre_timer_create(clock_id, evp, timerid)           \
+  __sanitizer_syscall_pre_impl_timer_create(                                   \
+      (long long)(clock_id), (long long)(evp), (long long)(timerid))
+#define __sanitizer_syscall_post_timer_create(res, clock_id, evp, timerid)     \
+  __sanitizer_syscall_post_impl_timer_create(                                  \
+      res, (long long)(clock_id), (long long)(evp), (long long)(timerid))
+#define __sanitizer_syscall_pre_timer_delete(timerid)                          \
+  __sanitizer_syscall_pre_impl_timer_delete((long long)(timerid))
+#define __sanitizer_syscall_post_timer_delete(res, timerid)                    \
+  __sanitizer_syscall_post_impl_timer_delete(res, (long long)(timerid))
+#define __sanitizer_syscall_pre_compat_50_timer_settime(timerid, flags, value, \
+                                                        ovalue)                \
+  __sanitizer_syscall_pre_impl_compat_50_timer_settime(                        \
+      (long long)(timerid), (long long)(flags), (long long)(value),            \
+      (long long)(ovalue))
+#define __sanitizer_syscall_post_compat_50_timer_settime(res, timerid, flags,  \
+                                                         value, ovalue)        \
+  __sanitizer_syscall_post_impl_compat_50_timer_settime(                       \
+      res, (long long)(timerid), (long long)(flags), (long long)(value),       \
+      (long long)(ovalue))
+#define __sanitizer_syscall_pre_compat_50_timer_gettime(timerid, value)        \
+  __sanitizer_syscall_pre_impl_compat_50_timer_gettime((long long)(timerid),   \
+                                                       (long long)(value))
+#define __sanitizer_syscall_post_compat_50_timer_gettime(res, timerid, value)  \
+  __sanitizer_syscall_post_impl_compat_50_timer_gettime(                       \
+      res, (long long)(timerid), (long long)(value))
+#define __sanitizer_syscall_pre_timer_getoverrun(timerid)                      \
+  __sanitizer_syscall_pre_impl_timer_getoverrun((long long)(timerid))
+#define __sanitizer_syscall_post_timer_getoverrun(res, timerid)                \
+  __sanitizer_syscall_post_impl_timer_getoverrun(res, (long long)(timerid))
+#define __sanitizer_syscall_pre_compat_50_nanosleep(rqtp, rmtp)                \
+  __sanitizer_syscall_pre_impl_compat_50_nanosleep((long long)(rqtp),          \
+                                                   (long long)(rmtp))
+#define __sanitizer_syscall_post_compat_50_nanosleep(res, rqtp, rmtp)          \
+  __sanitizer_syscall_post_impl_compat_50_nanosleep(res, (long long)(rqtp),    \
+                                                    (long long)(rmtp))
+#define __sanitizer_syscall_pre_fdatasync(fd)                                  \
+  __sanitizer_syscall_pre_impl_fdatasync((long long)(fd))
+#define __sanitizer_syscall_post_fdatasync(res, fd)                            \
+  __sanitizer_syscall_post_impl_fdatasync(res, (long long)(fd))
+#define __sanitizer_syscall_pre_mlockall(flags)                                \
+  __sanitizer_syscall_pre_impl_mlockall((long long)(flags))
+#define __sanitizer_syscall_post_mlockall(res, flags)                          \
+  __sanitizer_syscall_post_impl_mlockall(res, (long long)(flags))
+#define __sanitizer_syscall_pre_munlockall()                                   \
+  __sanitizer_syscall_pre_impl_munlockall()
+#define __sanitizer_syscall_post_munlockall(res)                               \
+  __sanitizer_syscall_post_impl_munlockall(res)
+#define __sanitizer_syscall_pre_compat_50___sigtimedwait(set, info, timeout)   \
+  __sanitizer_syscall_pre_impl_compat_50___sigtimedwait(                       \
+      (long long)(set), (long long)(info), (long long)(timeout))
+#define __sanitizer_syscall_post_compat_50___sigtimedwait(res, set, info,      \
+                                                          timeout)             \
+  __sanitizer_syscall_post_impl_compat_50___sigtimedwait(                      \
+      res, (long long)(set), (long long)(info), (long long)(timeout))
+#define __sanitizer_syscall_pre_sigqueueinfo(pid, info)                        \
+  __sanitizer_syscall_pre_impl_sigqueueinfo((long long)(pid), (long long)(info))
+#define __sanitizer_syscall_post_sigqueueinfo(res, pid, info)                  \
+  __sanitizer_syscall_post_impl_sigqueueinfo(res, (long long)(pid),            \
+                                             (long long)(info))
+#define __sanitizer_syscall_pre_modctl(cmd, arg)                               \
+  __sanitizer_syscall_pre_impl_modctl((long long)(cmd), (long long)(arg))
+#define __sanitizer_syscall_post_modctl(res, cmd, arg)                         \
+  __sanitizer_syscall_post_impl_modctl(res, (long long)(cmd), (long long)(arg))
+#define __sanitizer_syscall_pre__ksem_init(value, idp)                         \
+  __sanitizer_syscall_pre_impl__ksem_init((long long)(value), (long long)(idp))
+#define __sanitizer_syscall_post__ksem_init(res, value, idp)                   \
+  __sanitizer_syscall_post_impl__ksem_init(res, (long long)(value),            \
+                                           (long long)(idp))
+#define __sanitizer_syscall_pre__ksem_open(name, oflag, mode, value, idp)      \
+  __sanitizer_syscall_pre_impl__ksem_open(                                     \
+      (long long)(name), (long long)(oflag), (long long)(mode),                \
+      (long long)(value), (long long)(idp))
+#define __sanitizer_syscall_post__ksem_open(res, name, oflag, mode, value,     \
+                                            idp)                               \
+  __sanitizer_syscall_post_impl__ksem_open(                                    \
+      res, (long long)(name), (long long)(oflag), (long long)(mode),           \
+      (long long)(value), (long long)(idp))
+#define __sanitizer_syscall_pre__ksem_unlink(name)                             \
+  __sanitizer_syscall_pre_impl__ksem_unlink((long long)(name))
+#define __sanitizer_syscall_post__ksem_unlink(res, name)                       \
+  __sanitizer_syscall_post_impl__ksem_unlink(res, (long long)(name))
+#define __sanitizer_syscall_pre__ksem_close(id)                                \
+  __sanitizer_syscall_pre_impl__ksem_close((long long)(id))
+#define __sanitizer_syscall_post__ksem_close(res, id)                          \
+  __sanitizer_syscall_post_impl__ksem_close(res, (long long)(id))
+#define __sanitizer_syscall_pre__ksem_post(id)                                 \
+  __sanitizer_syscall_pre_impl__ksem_post((long long)(id))
+#define __sanitizer_syscall_post__ksem_post(res, id)                           \
+  __sanitizer_syscall_post_impl__ksem_post(res, (long long)(id))
+#define __sanitizer_syscall_pre__ksem_wait(id)                                 \
+  __sanitizer_syscall_pre_impl__ksem_wait((long long)(id))
+#define __sanitizer_syscall_post__ksem_wait(res, id)                           \
+  __sanitizer_syscall_post_impl__ksem_wait(res, (long long)(id))
+#define __sanitizer_syscall_pre__ksem_trywait(id)                              \
+  __sanitizer_syscall_pre_impl__ksem_trywait((long long)(id))
+#define __sanitizer_syscall_post__ksem_trywait(res, id)                        \
+  __sanitizer_syscall_post_impl__ksem_trywait(res, (long long)(id))
+#define __sanitizer_syscall_pre__ksem_getvalue(id, value)                      \
+  __sanitizer_syscall_pre_impl__ksem_getvalue((long long)(id),                 \
+                                              (long long)(value))
+#define __sanitizer_syscall_post__ksem_getvalue(res, id, value)                \
+  __sanitizer_syscall_post_impl__ksem_getvalue(res, (long long)(id),           \
+                                               (long long)(value))
+#define __sanitizer_syscall_pre__ksem_destroy(id)                              \
+  __sanitizer_syscall_pre_impl__ksem_destroy((long long)(id))
+#define __sanitizer_syscall_post__ksem_destroy(res, id)                        \
+  __sanitizer_syscall_post_impl__ksem_destroy(res, (long long)(id))
+#define __sanitizer_syscall_pre__ksem_timedwait(id, abstime)                   \
+  __sanitizer_syscall_pre_impl__ksem_timedwait((long long)(id),                \
+                                               (long long)(abstime))
+#define __sanitizer_syscall_post__ksem_timedwait(res, id, abstime)             \
+  __sanitizer_syscall_post_impl__ksem_timedwait(res, (long long)(id),          \
+                                                (long long)(abstime))
+#define __sanitizer_syscall_pre_mq_open(name, oflag, mode, attr)               \
+  __sanitizer_syscall_pre_impl_mq_open((long long)(name), (long long)(oflag),  \
+                                       (long long)(mode), (long long)(attr))
+#define __sanitizer_syscall_post_mq_open(res, name, oflag, mode, attr)         \
+  __sanitizer_syscall_post_impl_mq_open(res, (long long)(name),                \
+                                        (long long)(oflag), (long long)(mode), \
+                                        (long long)(attr))
+#define __sanitizer_syscall_pre_mq_close(mqdes)                                \
+  __sanitizer_syscall_pre_impl_mq_close((long long)(mqdes))
+#define __sanitizer_syscall_post_mq_close(res, mqdes)                          \
+  __sanitizer_syscall_post_impl_mq_close(res, (long long)(mqdes))
+#define __sanitizer_syscall_pre_mq_unlink(name)                                \
+  __sanitizer_syscall_pre_impl_mq_unlink((long long)(name))
+#define __sanitizer_syscall_post_mq_unlink(res, name)                          \
+  __sanitizer_syscall_post_impl_mq_unlink(res, (long long)(name))
+#define __sanitizer_syscall_pre_mq_getattr(mqdes, mqstat)                      \
+  __sanitizer_syscall_pre_impl_mq_getattr((long long)(mqdes),                  \
+                                          (long long)(mqstat))
+#define __sanitizer_syscall_post_mq_getattr(res, mqdes, mqstat)                \
+  __sanitizer_syscall_post_impl_mq_getattr(res, (long long)(mqdes),            \
+                                           (long long)(mqstat))
+#define __sanitizer_syscall_pre_mq_setattr(mqdes, mqstat, omqstat)             \
+  __sanitizer_syscall_pre_impl_mq_setattr(                                     \
+      (long long)(mqdes), (long long)(mqstat), (long long)(omqstat))
+#define __sanitizer_syscall_post_mq_setattr(res, mqdes, mqstat, omqstat)       \
+  __sanitizer_syscall_post_impl_mq_setattr(                                    \
+      res, (long long)(mqdes), (long long)(mqstat), (long long)(omqstat))
+#define __sanitizer_syscall_pre_mq_notify(mqdes, notification)                 \
+  __sanitizer_syscall_pre_impl_mq_notify((long long)(mqdes),                   \
+                                         (long long)(notification))
+#define __sanitizer_syscall_post_mq_notify(res, mqdes, notification)           \
+  __sanitizer_syscall_post_impl_mq_notify(res, (long long)(mqdes),             \
+                                          (long long)(notification))
+#define __sanitizer_syscall_pre_mq_send(mqdes, msg_ptr, msg_len, msg_prio)     \
+  __sanitizer_syscall_pre_impl_mq_send(                                        \
+      (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),          \
+      (long long)(msg_prio))
+#define __sanitizer_syscall_post_mq_send(res, mqdes, msg_ptr, msg_len,         \
+                                         msg_prio)                             \
+  __sanitizer_syscall_post_impl_mq_send(                                       \
+      res, (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),     \
+      (long long)(msg_prio))
+#define __sanitizer_syscall_pre_mq_receive(mqdes, msg_ptr, msg_len, msg_prio)  \
+  __sanitizer_syscall_pre_impl_mq_receive(                                     \
+      (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),          \
+      (long long)(msg_prio))
+#define __sanitizer_syscall_post_mq_receive(res, mqdes, msg_ptr, msg_len,      \
+                                            msg_prio)                          \
+  __sanitizer_syscall_post_impl_mq_receive(                                    \
+      res, (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),     \
+      (long long)(msg_prio))
+#define __sanitizer_syscall_pre_compat_50_mq_timedsend(                        \
+    mqdes, msg_ptr, msg_len, msg_prio, abs_timeout)                            \
+  __sanitizer_syscall_pre_impl_compat_50_mq_timedsend(                         \
+      (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),          \
+      (long long)(msg_prio), (long long)(abs_timeout))
+#define __sanitizer_syscall_post_compat_50_mq_timedsend(                       \
+    res, mqdes, msg_ptr, msg_len, msg_prio, abs_timeout)                       \
+  __sanitizer_syscall_post_impl_compat_50_mq_timedsend(                        \
+      res, (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),     \
+      (long long)(msg_prio), (long long)(abs_timeout))
+#define __sanitizer_syscall_pre_compat_50_mq_timedreceive(                     \
+    mqdes, msg_ptr, msg_len, msg_prio, abs_timeout)                            \
+  __sanitizer_syscall_pre_impl_compat_50_mq_timedreceive(                      \
+      (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),          \
+      (long long)(msg_prio), (long long)(abs_timeout))
+#define __sanitizer_syscall_post_compat_50_mq_timedreceive(                    \
+    res, mqdes, msg_ptr, msg_len, msg_prio, abs_timeout)                       \
+  __sanitizer_syscall_post_impl_compat_50_mq_timedreceive(                     \
+      res, (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),     \
+      (long long)(msg_prio), (long long)(abs_timeout))
+/* syscall 267 has been skipped */
+/* syscall 268 has been skipped */
+/* syscall 269 has been skipped */
+#define __sanitizer_syscall_pre___posix_rename(from, to)                       \
+  __sanitizer_syscall_pre_impl___posix_rename((long long)(from),               \
+                                              (long long)(to))
+#define __sanitizer_syscall_post___posix_rename(res, from, to)                 \
+  __sanitizer_syscall_post_impl___posix_rename(res, (long long)(from),         \
+                                               (long long)(to))
+#define __sanitizer_syscall_pre_swapctl(cmd, arg, misc)                        \
+  __sanitizer_syscall_pre_impl_swapctl((long long)(cmd), (long long)(arg),     \
+                                       (long long)(misc))
+#define __sanitizer_syscall_post_swapctl(res, cmd, arg, misc)                  \
+  __sanitizer_syscall_post_impl_swapctl(res, (long long)(cmd),                 \
+                                        (long long)(arg), (long long)(misc))
+#define __sanitizer_syscall_pre_compat_30_getdents(fd, buf, count)             \
+  __sanitizer_syscall_pre_impl_compat_30_getdents(                             \
+      (long long)(fd), (long long)(buf), (long long)(count))
+#define __sanitizer_syscall_post_compat_30_getdents(res, fd, buf, count)       \
+  __sanitizer_syscall_post_impl_compat_30_getdents(                            \
+      res, (long long)(fd), (long long)(buf), (long long)(count))
+#define __sanitizer_syscall_pre_minherit(addr, len, inherit)                   \
+  __sanitizer_syscall_pre_impl_minherit((long long)(addr), (long long)(len),   \
+                                        (long long)(inherit))
+#define __sanitizer_syscall_post_minherit(res, addr, len, inherit)             \
+  __sanitizer_syscall_post_impl_minherit(                                      \
+      res, (long long)(addr), (long long)(len), (long long)(inherit))
+#define __sanitizer_syscall_pre_lchmod(path, mode)                             \
+  __sanitizer_syscall_pre_impl_lchmod((long long)(path), (long long)(mode))
+#define __sanitizer_syscall_post_lchmod(res, path, mode)                       \
+  __sanitizer_syscall_post_impl_lchmod(res, (long long)(path),                 \
+                                       (long long)(mode))
+#define __sanitizer_syscall_pre_lchown(path, uid, gid)                         \
+  __sanitizer_syscall_pre_impl_lchown((long long)(path), (long long)(uid),     \
+                                      (long long)(gid))
+#define __sanitizer_syscall_post_lchown(res, path, uid, gid)                   \
+  __sanitizer_syscall_post_impl_lchown(res, (long long)(path),                 \
+                                       (long long)(uid), (long long)(gid))
+#define __sanitizer_syscall_pre_compat_50_lutimes(path, tptr)                  \
+  __sanitizer_syscall_pre_impl_compat_50_lutimes((long long)(path),            \
+                                                 (long long)(tptr))
+#define __sanitizer_syscall_post_compat_50_lutimes(res, path, tptr)            \
+  __sanitizer_syscall_post_impl_compat_50_lutimes(res, (long long)(path),      \
+                                                  (long long)(tptr))
+#define __sanitizer_syscall_pre___msync13(addr, len, flags)                    \
+  __sanitizer_syscall_pre_impl___msync13((long long)(addr), (long long)(len),  \
+                                         (long long)(flags))
+#define __sanitizer_syscall_post___msync13(res, addr, len, flags)              \
+  __sanitizer_syscall_post_impl___msync13(                                     \
+      res, (long long)(addr), (long long)(len), (long long)(flags))
+#define __sanitizer_syscall_pre_compat_30___stat13(path, ub)                   \
+  __sanitizer_syscall_pre_impl_compat_30___stat13((long long)(path),           \
+                                                  (long long)(ub))
+#define __sanitizer_syscall_post_compat_30___stat13(res, path, ub)             \
+  __sanitizer_syscall_post_impl_compat_30___stat13(res, (long long)(path),     \
+                                                   (long long)(ub))
+#define __sanitizer_syscall_pre_compat_30___fstat13(fd, sb)                    \
+  __sanitizer_syscall_pre_impl_compat_30___fstat13((long long)(fd),            \
+                                                   (long long)(sb))
+#define __sanitizer_syscall_post_compat_30___fstat13(res, fd, sb)              \
+  __sanitizer_syscall_post_impl_compat_30___fstat13(res, (long long)(fd),      \
+                                                    (long long)(sb))
+#define __sanitizer_syscall_pre_compat_30___lstat13(path, ub)                  \
+  __sanitizer_syscall_pre_impl_compat_30___lstat13((long long)(path),          \
+                                                   (long long)(ub))
+#define __sanitizer_syscall_post_compat_30___lstat13(res, path, ub)            \
+  __sanitizer_syscall_post_impl_compat_30___lstat13(res, (long long)(path),    \
+                                                    (long long)(ub))
+#define __sanitizer_syscall_pre___sigaltstack14(nss, oss)                      \
+  __sanitizer_syscall_pre_impl___sigaltstack14((long long)(nss),               \
+                                               (long long)(oss))
+#define __sanitizer_syscall_post___sigaltstack14(res, nss, oss)                \
+  __sanitizer_syscall_post_impl___sigaltstack14(res, (long long)(nss),         \
+                                                (long long)(oss))
+#define __sanitizer_syscall_pre___vfork14()                                    \
+  __sanitizer_syscall_pre_impl___vfork14()
+#define __sanitizer_syscall_post___vfork14(res)                                \
+  __sanitizer_syscall_post_impl___vfork14(res)
+#define __sanitizer_syscall_pre___posix_chown(path, uid, gid)                  \
+  __sanitizer_syscall_pre_impl___posix_chown(                                  \
+      (long long)(path), (long long)(uid), (long long)(gid))
+#define __sanitizer_syscall_post___posix_chown(res, path, uid, gid)            \
+  __sanitizer_syscall_post_impl___posix_chown(                                 \
+      res, (long long)(path), (long long)(uid), (long long)(gid))
+#define __sanitizer_syscall_pre___posix_fchown(fd, uid, gid)                   \
+  __sanitizer_syscall_pre_impl___posix_fchown(                                 \
+      (long long)(fd), (long long)(uid), (long long)(gid))
+#define __sanitizer_syscall_post___posix_fchown(res, fd, uid, gid)             \
+  __sanitizer_syscall_post_impl___posix_fchown(                                \
+      res, (long long)(fd), (long long)(uid), (long long)(gid))
+#define __sanitizer_syscall_pre___posix_lchown(path, uid, gid)                 \
+  __sanitizer_syscall_pre_impl___posix_lchown(                                 \
+      (long long)(path), (long long)(uid), (long long)(gid))
+#define __sanitizer_syscall_post___posix_lchown(res, path, uid, gid)           \
+  __sanitizer_syscall_post_impl___posix_lchown(                                \
+      res, (long long)(path), (long long)(uid), (long long)(gid))
+#define __sanitizer_syscall_pre_getsid(pid)                                    \
+  __sanitizer_syscall_pre_impl_getsid((long long)(pid))
+#define __sanitizer_syscall_post_getsid(res, pid)                              \
+  __sanitizer_syscall_post_impl_getsid(res, (long long)(pid))
+#define __sanitizer_syscall_pre___clone(flags, stack)                          \
+  __sanitizer_syscall_pre_impl___clone((long long)(flags), (long long)(stack))
+#define __sanitizer_syscall_post___clone(res, flags, stack)                    \
+  __sanitizer_syscall_post_impl___clone(res, (long long)(flags),               \
+                                        (long long)(stack))
+#define __sanitizer_syscall_pre_fktrace(fd, ops, facs, pid)                    \
+  __sanitizer_syscall_pre_impl_fktrace((long long)(fd), (long long)(ops),      \
+                                       (long long)(facs), (long long)(pid))
+#define __sanitizer_syscall_post_fktrace(res, fd, ops, facs, pid)              \
+  __sanitizer_syscall_post_impl_fktrace(res, (long long)(fd),                  \
+                                        (long long)(ops), (long long)(facs),   \
+                                        (long long)(pid))
+#define __sanitizer_syscall_pre_preadv(fd, iovp, iovcnt, PAD, offset)          \
+  __sanitizer_syscall_pre_impl_preadv((long long)(fd), (long long)(iovp),      \
+                                      (long long)(iovcnt), (long long)(PAD),   \
+                                      (long long)(offset))
+#define __sanitizer_syscall_post_preadv(res, fd, iovp, iovcnt, PAD, offset)    \
+  __sanitizer_syscall_post_impl_preadv(res, (long long)(fd),                   \
+                                       (long long)(iovp), (long long)(iovcnt), \
+                                       (long long)(PAD), (long long)(offset))
+#define __sanitizer_syscall_pre_pwritev(fd, iovp, iovcnt, PAD, offset)         \
+  __sanitizer_syscall_pre_impl_pwritev((long long)(fd), (long long)(iovp),     \
+                                       (long long)(iovcnt), (long long)(PAD),  \
+                                       (long long)(offset))
+#define __sanitizer_syscall_post_pwritev(res, fd, iovp, iovcnt, PAD, offset)   \
+  __sanitizer_syscall_post_impl_pwritev(                                       \
+      res, (long long)(fd), (long long)(iovp), (long long)(iovcnt),            \
+      (long long)(PAD), (long long)(offset))
+#define __sanitizer_syscall_pre_compat_16___sigaction14(signum, nsa, osa)      \
+  __sanitizer_syscall_pre_impl_compat_16___sigaction14(                        \
+      (long long)(signum), (long long)(nsa), (long long)(osa))
+#define __sanitizer_syscall_post_compat_16___sigaction14(res, signum, nsa,     \
+                                                         osa)                  \
+  __sanitizer_syscall_post_impl_compat_16___sigaction14(                       \
+      res, (long long)(signum), (long long)(nsa), (long long)(osa))
+#define __sanitizer_syscall_pre___sigpending14(set)                            \
+  __sanitizer_syscall_pre_impl___sigpending14((long long)(set))
+#define __sanitizer_syscall_post___sigpending14(res, set)                      \
+  __sanitizer_syscall_post_impl___sigpending14(res, (long long)(set))
+#define __sanitizer_syscall_pre___sigprocmask14(how, set, oset)                \
+  __sanitizer_syscall_pre_impl___sigprocmask14(                                \
+      (long long)(how), (long long)(set), (long long)(oset))
+#define __sanitizer_syscall_post___sigprocmask14(res, how, set, oset)          \
+  __sanitizer_syscall_post_impl___sigprocmask14(                               \
+      res, (long long)(how), (long long)(set), (long long)(oset))
+#define __sanitizer_syscall_pre___sigsuspend14(set)                            \
+  __sanitizer_syscall_pre_impl___sigsuspend14((long long)(set))
+#define __sanitizer_syscall_post___sigsuspend14(res, set)                      \
+  __sanitizer_syscall_post_impl___sigsuspend14(res, (long long)(set))
+#define __sanitizer_syscall_pre_compat_16___sigreturn14(sigcntxp)              \
+  __sanitizer_syscall_pre_impl_compat_16___sigreturn14((long long)(sigcntxp))
+#define __sanitizer_syscall_post_compat_16___sigreturn14(res, sigcntxp)        \
+  __sanitizer_syscall_post_impl_compat_16___sigreturn14(res,                   \
+                                                        (long long)(sigcntxp))
+#define __sanitizer_syscall_pre___getcwd(bufp, length)                         \
+  __sanitizer_syscall_pre_impl___getcwd((long long)(bufp), (long long)(length))
+#define __sanitizer_syscall_post___getcwd(res, bufp, length)                   \
+  __sanitizer_syscall_post_impl___getcwd(res, (long long)(bufp),               \
+                                         (long long)(length))
+#define __sanitizer_syscall_pre_fchroot(fd)                                    \
+  __sanitizer_syscall_pre_impl_fchroot((long long)(fd))
+#define __sanitizer_syscall_post_fchroot(res, fd)                              \
+  __sanitizer_syscall_post_impl_fchroot(res, (long long)(fd))
+#define __sanitizer_syscall_pre_compat_30_fhopen(fhp, flags)                   \
+  __sanitizer_syscall_pre_impl_compat_30_fhopen((long long)(fhp),              \
+                                                (long long)(flags))
+#define __sanitizer_syscall_post_compat_30_fhopen(res, fhp, flags)             \
+  __sanitizer_syscall_post_impl_compat_30_fhopen(res, (long long)(fhp),        \
+                                                 (long long)(flags))
+#define __sanitizer_syscall_pre_compat_30_fhstat(fhp, sb)                      \
+  __sanitizer_syscall_pre_impl_compat_30_fhstat((long long)(fhp),              \
+                                                (long long)(sb))
+#define __sanitizer_syscall_post_compat_30_fhstat(res, fhp, sb)                \
+  __sanitizer_syscall_post_impl_compat_30_fhstat(res, (long long)(fhp),        \
+                                                 (long long)(sb))
+#define __sanitizer_syscall_pre_compat_20_fhstatfs(fhp, buf)                   \
+  __sanitizer_syscall_pre_impl_compat_20_fhstatfs((long long)(fhp),            \
+                                                  (long long)(buf))
+#define __sanitizer_syscall_post_compat_20_fhstatfs(res, fhp, buf)             \
+  __sanitizer_syscall_post_impl_compat_20_fhstatfs(res, (long long)(fhp),      \
+                                                   (long long)(buf))
+#define __sanitizer_syscall_pre_compat_50_____semctl13(semid, semnum, cmd,     \
+                                                       arg)                    \
+  __sanitizer_syscall_pre_impl_compat_50_____semctl13(                         \
+      (long long)(semid), (long long)(semnum), (long long)(cmd),               \
+      (long long)(arg))
+#define __sanitizer_syscall_post_compat_50_____semctl13(res, semid, semnum,    \
+                                                        cmd, arg)              \
+  __sanitizer_syscall_post_impl_compat_50_____semctl13(                        \
+      res, (long long)(semid), (long long)(semnum), (long long)(cmd),          \
+      (long long)(arg))
+#define __sanitizer_syscall_pre_compat_50___msgctl13(msqid, cmd, buf)          \
+  __sanitizer_syscall_pre_impl_compat_50___msgctl13(                           \
+      (long long)(msqid), (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_post_compat_50___msgctl13(res, msqid, cmd, buf)    \
+  __sanitizer_syscall_post_impl_compat_50___msgctl13(                          \
+      res, (long long)(msqid), (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_pre_compat_50___shmctl13(shmid, cmd, buf)          \
+  __sanitizer_syscall_pre_impl_compat_50___shmctl13(                           \
+      (long long)(shmid), (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_post_compat_50___shmctl13(res, shmid, cmd, buf)    \
+  __sanitizer_syscall_post_impl_compat_50___shmctl13(                          \
+      res, (long long)(shmid), (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_pre_lchflags(path, flags)                          \
+  __sanitizer_syscall_pre_impl_lchflags((long long)(path), (long long)(flags))
+#define __sanitizer_syscall_post_lchflags(res, path, flags)                    \
+  __sanitizer_syscall_post_impl_lchflags(res, (long long)(path),               \
+                                         (long long)(flags))
+#define __sanitizer_syscall_pre_issetugid()                                    \
+  __sanitizer_syscall_pre_impl_issetugid()
+#define __sanitizer_syscall_post_issetugid(res)                                \
+  __sanitizer_syscall_post_impl_issetugid(res)
+#define __sanitizer_syscall_pre_utrace(label, addr, len)                       \
+  __sanitizer_syscall_pre_impl_utrace((long long)(label), (long long)(addr),   \
+                                      (long long)(len))
+#define __sanitizer_syscall_post_utrace(res, label, addr, len)                 \
+  __sanitizer_syscall_post_impl_utrace(res, (long long)(label),                \
+                                       (long long)(addr), (long long)(len))
+#define __sanitizer_syscall_pre_getcontext(ucp)                                \
+  __sanitizer_syscall_pre_impl_getcontext((long long)(ucp))
+#define __sanitizer_syscall_post_getcontext(res, ucp)                          \
+  __sanitizer_syscall_post_impl_getcontext(res, (long long)(ucp))
+#define __sanitizer_syscall_pre_setcontext(ucp)                                \
+  __sanitizer_syscall_pre_impl_setcontext((long long)(ucp))
+#define __sanitizer_syscall_post_setcontext(res, ucp)                          \
+  __sanitizer_syscall_post_impl_setcontext(res, (long long)(ucp))
+#define __sanitizer_syscall_pre__lwp_create(ucp, flags, new_lwp)               \
+  __sanitizer_syscall_pre_impl__lwp_create(                                    \
+      (long long)(ucp), (long long)(flags), (long long)(new_lwp))
+#define __sanitizer_syscall_post__lwp_create(res, ucp, flags, new_lwp)         \
+  __sanitizer_syscall_post_impl__lwp_create(                                   \
+      res, (long long)(ucp), (long long)(flags), (long long)(new_lwp))
+#define __sanitizer_syscall_pre__lwp_exit()                                    \
+  __sanitizer_syscall_pre_impl__lwp_exit()
+#define __sanitizer_syscall_post__lwp_exit(res)                                \
+  __sanitizer_syscall_post_impl__lwp_exit(res)
+#define __sanitizer_syscall_pre__lwp_self()                                    \
+  __sanitizer_syscall_pre_impl__lwp_self()
+#define __sanitizer_syscall_post__lwp_self(res)                                \
+  __sanitizer_syscall_post_impl__lwp_self(res)
+#define __sanitizer_syscall_pre__lwp_wait(wait_for, departed)                  \
+  __sanitizer_syscall_pre_impl__lwp_wait((long long)(wait_for),                \
+                                         (long long)(departed))
+#define __sanitizer_syscall_post__lwp_wait(res, wait_for, departed)            \
+  __sanitizer_syscall_post_impl__lwp_wait(res, (long long)(wait_for),          \
+                                          (long long)(departed))
+#define __sanitizer_syscall_pre__lwp_suspend(target)                           \
+  __sanitizer_syscall_pre_impl__lwp_suspend((long long)(target))
+#define __sanitizer_syscall_post__lwp_suspend(res, target)                     \
+  __sanitizer_syscall_post_impl__lwp_suspend(res, (long long)(target))
+#define __sanitizer_syscall_pre__lwp_continue(target)                          \
+  __sanitizer_syscall_pre_impl__lwp_continue((long long)(target))
+#define __sanitizer_syscall_post__lwp_continue(res, target)                    \
+  __sanitizer_syscall_post_impl__lwp_continue(res, (long long)(target))
+#define __sanitizer_syscall_pre__lwp_wakeup(target)                            \
+  __sanitizer_syscall_pre_impl__lwp_wakeup((long long)(target))
+#define __sanitizer_syscall_post__lwp_wakeup(res, target)                      \
+  __sanitizer_syscall_post_impl__lwp_wakeup(res, (long long)(target))
+#define __sanitizer_syscall_pre__lwp_getprivate()                              \
+  __sanitizer_syscall_pre_impl__lwp_getprivate()
+#define __sanitizer_syscall_post__lwp_getprivate(res)                          \
+  __sanitizer_syscall_post_impl__lwp_getprivate(res)
+#define __sanitizer_syscall_pre__lwp_setprivate(ptr)                           \
+  __sanitizer_syscall_pre_impl__lwp_setprivate((long long)(ptr))
+#define __sanitizer_syscall_post__lwp_setprivate(res, ptr)                     \
+  __sanitizer_syscall_post_impl__lwp_setprivate(res, (long long)(ptr))
+#define __sanitizer_syscall_pre__lwp_kill(target, signo)                       \
+  __sanitizer_syscall_pre_impl__lwp_kill((long long)(target),                  \
+                                         (long long)(signo))
+#define __sanitizer_syscall_post__lwp_kill(res, target, signo)                 \
+  __sanitizer_syscall_post_impl__lwp_kill(res, (long long)(target),            \
+                                          (long long)(signo))
+#define __sanitizer_syscall_pre__lwp_detach(target)                            \
+  __sanitizer_syscall_pre_impl__lwp_detach((long long)(target))
+#define __sanitizer_syscall_post__lwp_detach(res, target)                      \
+  __sanitizer_syscall_post_impl__lwp_detach(res, (long long)(target))
+#define __sanitizer_syscall_pre_compat_50__lwp_park(ts, unpark, hint,          \
+                                                    unparkhint)                \
+  __sanitizer_syscall_pre_impl_compat_50__lwp_park(                            \
+      (long long)(ts), (long long)(unpark), (long long)(hint),                 \
+      (long long)(unparkhint))
+#define __sanitizer_syscall_post_compat_50__lwp_park(res, ts, unpark, hint,    \
+                                                     unparkhint)               \
+  __sanitizer_syscall_post_impl_compat_50__lwp_park(                           \
+      res, (long long)(ts), (long long)(unpark), (long long)(hint),            \
+      (long long)(unparkhint))
+#define __sanitizer_syscall_pre__lwp_unpark(target, hint)                      \
+  __sanitizer_syscall_pre_impl__lwp_unpark((long long)(target),                \
+                                           (long long)(hint))
+#define __sanitizer_syscall_post__lwp_unpark(res, target, hint)                \
+  __sanitizer_syscall_post_impl__lwp_unpark(res, (long long)(target),          \
+                                            (long long)(hint))
+#define __sanitizer_syscall_pre__lwp_unpark_all(targets, ntargets, hint)       \
+  __sanitizer_syscall_pre_impl__lwp_unpark_all(                                \
+      (long long)(targets), (long long)(ntargets), (long long)(hint))
+#define __sanitizer_syscall_post__lwp_unpark_all(res, targets, ntargets, hint) \
+  __sanitizer_syscall_post_impl__lwp_unpark_all(                               \
+      res, (long long)(targets), (long long)(ntargets), (long long)(hint))
+#define __sanitizer_syscall_pre__lwp_setname(target, name)                     \
+  __sanitizer_syscall_pre_impl__lwp_setname((long long)(target),               \
+                                            (long long)(name))
+#define __sanitizer_syscall_post__lwp_setname(res, target, name)               \
+  __sanitizer_syscall_post_impl__lwp_setname(res, (long long)(target),         \
+                                             (long long)(name))
+#define __sanitizer_syscall_pre__lwp_getname(target, name, len)                \
+  __sanitizer_syscall_pre_impl__lwp_getname(                                   \
+      (long long)(target), (long long)(name), (long long)(len))
+#define __sanitizer_syscall_post__lwp_getname(res, target, name, len)          \
+  __sanitizer_syscall_post_impl__lwp_getname(                                  \
+      res, (long long)(target), (long long)(name), (long long)(len))
+#define __sanitizer_syscall_pre__lwp_ctl(features, address)                    \
+  __sanitizer_syscall_pre_impl__lwp_ctl((long long)(features),                 \
+                                        (long long)(address))
+#define __sanitizer_syscall_post__lwp_ctl(res, features, address)              \
+  __sanitizer_syscall_post_impl__lwp_ctl(res, (long long)(features),           \
+                                         (long long)(address))
+/* syscall 326 has been skipped */
+/* syscall 327 has been skipped */
+/* syscall 328 has been skipped */
+/* syscall 329 has been skipped */
+#define __sanitizer_syscall_pre_compat_60_sa_register(newv, oldv, flags,       \
+                                                      stackinfo_offset)        \
+  __sanitizer_syscall_pre_impl_compat_60_sa_register(                          \
+      (long long)(newv), (long long)(oldv), (long long)(flags),                \
+      (long long)(stackinfo_offset))
+#define __sanitizer_syscall_post_compat_60_sa_register(res, newv, oldv, flags, \
+                                                       stackinfo_offset)       \
+  __sanitizer_syscall_post_impl_compat_60_sa_register(                         \
+      res, (long long)(newv), (long long)(oldv), (long long)(flags),           \
+      (long long)(stackinfo_offset))
+#define __sanitizer_syscall_pre_compat_60_sa_stacks(num, stacks)               \
+  __sanitizer_syscall_pre_impl_compat_60_sa_stacks((long long)(num),           \
+                                                   (long long)(stacks))
+#define __sanitizer_syscall_post_compat_60_sa_stacks(res, num, stacks)         \
+  __sanitizer_syscall_post_impl_compat_60_sa_stacks(res, (long long)(num),     \
+                                                    (long long)(stacks))
+#define __sanitizer_syscall_pre_compat_60_sa_enable()                          \
+  __sanitizer_syscall_pre_impl_compat_60_sa_enable()
+#define __sanitizer_syscall_post_compat_60_sa_enable(res)                      \
+  __sanitizer_syscall_post_impl_compat_60_sa_enable(res)
+#define __sanitizer_syscall_pre_compat_60_sa_setconcurrency(concurrency)       \
+  __sanitizer_syscall_pre_impl_compat_60_sa_setconcurrency(                    \
+      (long long)(concurrency))
+#define __sanitizer_syscall_post_compat_60_sa_setconcurrency(res, concurrency) \
+  __sanitizer_syscall_post_impl_compat_60_sa_setconcurrency(                   \
+      res, (long long)(concurrency))
+#define __sanitizer_syscall_pre_compat_60_sa_yield()                           \
+  __sanitizer_syscall_pre_impl_compat_60_sa_yield()
+#define __sanitizer_syscall_post_compat_60_sa_yield(res)                       \
+  __sanitizer_syscall_post_impl_compat_60_sa_yield(res)
+#define __sanitizer_syscall_pre_compat_60_sa_preempt(sa_id)                    \
+  __sanitizer_syscall_pre_impl_compat_60_sa_preempt((long long)(sa_id))
+#define __sanitizer_syscall_post_compat_60_sa_preempt(res, sa_id)              \
+  __sanitizer_syscall_post_impl_compat_60_sa_preempt(res, (long long)(sa_id))
+/* syscall 336 has been skipped */
+/* syscall 337 has been skipped */
+/* syscall 338 has been skipped */
+/* syscall 339 has been skipped */
+#define __sanitizer_syscall_pre___sigaction_sigtramp(signum, nsa, osa, tramp,  \
+                                                     vers)                     \
+  __sanitizer_syscall_pre_impl___sigaction_sigtramp(                           \
+      (long long)(signum), (long long)(nsa), (long long)(osa),                 \
+      (long long)(tramp), (long long)(vers))
+#define __sanitizer_syscall_post___sigaction_sigtramp(res, signum, nsa, osa,   \
+                                                      tramp, vers)             \
+  __sanitizer_syscall_post_impl___sigaction_sigtramp(                          \
+      res, (long long)(signum), (long long)(nsa), (long long)(osa),            \
+      (long long)(tramp), (long long)(vers))
+#define __sanitizer_syscall_pre_pmc_get_info(ctr, op, args)                    \
+  __sanitizer_syscall_pre_impl_pmc_get_info((long long)(ctr), (long long)(op), \
+                                            (long long)(args))
+#define __sanitizer_syscall_post_pmc_get_info(res, ctr, op, args)              \
+  __sanitizer_syscall_post_impl_pmc_get_info(                                  \
+      res, (long long)(ctr), (long long)(op), (long long)(args))
+#define __sanitizer_syscall_pre_pmc_control(ctr, op, args)                     \
+  __sanitizer_syscall_pre_impl_pmc_control((long long)(ctr), (long long)(op),  \
+                                           (long long)(args))
+#define __sanitizer_syscall_post_pmc_control(res, ctr, op, args)               \
+  __sanitizer_syscall_post_impl_pmc_control(                                   \
+      res, (long long)(ctr), (long long)(op), (long long)(args))
+#define __sanitizer_syscall_pre_rasctl(addr, len, op)                          \
+  __sanitizer_syscall_pre_impl_rasctl((long long)(addr), (long long)(len),     \
+                                      (long long)(op))
+#define __sanitizer_syscall_post_rasctl(res, addr, len, op)                    \
+  __sanitizer_syscall_post_impl_rasctl(res, (long long)(addr),                 \
+                                       (long long)(len), (long long)(op))
+#define __sanitizer_syscall_pre_kqueue() __sanitizer_syscall_pre_impl_kqueue()
+#define __sanitizer_syscall_post_kqueue(res)                                   \
+  __sanitizer_syscall_post_impl_kqueue(res)
+#define __sanitizer_syscall_pre_compat_50_kevent(fd, changelist, nchanges,     \
+                                                 eventlist, nevents, timeout)  \
+  __sanitizer_syscall_pre_impl_compat_50_kevent(                               \
+      (long long)(fd), (long long)(changelist), (long long)(nchanges),         \
+      (long long)(eventlist), (long long)(nevents), (long long)(timeout))
+#define __sanitizer_syscall_post_compat_50_kevent(                             \
+    res, fd, changelist, nchanges, eventlist, nevents, timeout)                \
+  __sanitizer_syscall_post_impl_compat_50_kevent(                              \
+      res, (long long)(fd), (long long)(changelist), (long long)(nchanges),    \
+      (long long)(eventlist), (long long)(nevents), (long long)(timeout))
+#define __sanitizer_syscall_pre__sched_setparam(pid, lid, policy, params)      \
+  __sanitizer_syscall_pre_impl__sched_setparam(                                \
+      (long long)(pid), (long long)(lid), (long long)(policy),                 \
+      (long long)(params))
+#define __sanitizer_syscall_post__sched_setparam(res, pid, lid, policy,        \
+                                                 params)                       \
+  __sanitizer_syscall_post_impl__sched_setparam(                               \
+      res, (long long)(pid), (long long)(lid), (long long)(policy),            \
+      (long long)(params))
+#define __sanitizer_syscall_pre__sched_getparam(pid, lid, policy, params)      \
+  __sanitizer_syscall_pre_impl__sched_getparam(                                \
+      (long long)(pid), (long long)(lid), (long long)(policy),                 \
+      (long long)(params))
+#define __sanitizer_syscall_post__sched_getparam(res, pid, lid, policy,        \
+                                                 params)                       \
+  __sanitizer_syscall_post_impl__sched_getparam(                               \
+      res, (long long)(pid), (long long)(lid), (long long)(policy),            \
+      (long long)(params))
+#define __sanitizer_syscall_pre__sched_setaffinity(pid, lid, size, cpuset)     \
+  __sanitizer_syscall_pre_impl__sched_setaffinity(                             \
+      (long long)(pid), (long long)(lid), (long long)(size),                   \
+      (long long)(cpuset))
+#define __sanitizer_syscall_post__sched_setaffinity(res, pid, lid, size,       \
+                                                    cpuset)                    \
+  __sanitizer_syscall_post_impl__sched_setaffinity(                            \
+      res, (long long)(pid), (long long)(lid), (long long)(size),              \
+      (long long)(cpuset))
+#define __sanitizer_syscall_pre__sched_getaffinity(pid, lid, size, cpuset)     \
+  __sanitizer_syscall_pre_impl__sched_getaffinity(                             \
+      (long long)(pid), (long long)(lid), (long long)(size),                   \
+      (long long)(cpuset))
+#define __sanitizer_syscall_post__sched_getaffinity(res, pid, lid, size,       \
+                                                    cpuset)                    \
+  __sanitizer_syscall_post_impl__sched_getaffinity(                            \
+      res, (long long)(pid), (long long)(lid), (long long)(size),              \
+      (long long)(cpuset))
+#define __sanitizer_syscall_pre_sched_yield()                                  \
+  __sanitizer_syscall_pre_impl_sched_yield()
+#define __sanitizer_syscall_post_sched_yield(res)                              \
+  __sanitizer_syscall_post_impl_sched_yield(res)
+#define __sanitizer_syscall_pre__sched_protect(priority)                       \
+  __sanitizer_syscall_pre_impl__sched_protect((long long)(priority))
+#define __sanitizer_syscall_post__sched_protect(res, priority)                 \
+  __sanitizer_syscall_post_impl__sched_protect(res, (long long)(priority))
+/* syscall 352 has been skipped */
+/* syscall 353 has been skipped */
+#define __sanitizer_syscall_pre_fsync_range(fd, flags, start, length)          \
+  __sanitizer_syscall_pre_impl_fsync_range(                                    \
+      (long long)(fd), (long long)(flags), (long long)(start),                 \
+      (long long)(length))
+#define __sanitizer_syscall_post_fsync_range(res, fd, flags, start, length)    \
+  __sanitizer_syscall_post_impl_fsync_range(                                   \
+      res, (long long)(fd), (long long)(flags), (long long)(start),            \
+      (long long)(length))
+#define __sanitizer_syscall_pre_uuidgen(store, count)                          \
+  __sanitizer_syscall_pre_impl_uuidgen((long long)(store), (long long)(count))
+#define __sanitizer_syscall_post_uuidgen(res, store, count)                    \
+  __sanitizer_syscall_post_impl_uuidgen(res, (long long)(store),               \
+                                        (long long)(count))
+#define __sanitizer_syscall_pre_getvfsstat(buf, bufsize, flags)                \
+  __sanitizer_syscall_pre_impl_getvfsstat(                                     \
+      (long long)(buf), (long long)(bufsize), (long long)(flags))
+#define __sanitizer_syscall_post_getvfsstat(res, buf, bufsize, flags)          \
+  __sanitizer_syscall_post_impl_getvfsstat(                                    \
+      res, (long long)(buf), (long long)(bufsize), (long long)(flags))
+#define __sanitizer_syscall_pre_statvfs1(path, buf, flags)                     \
+  __sanitizer_syscall_pre_impl_statvfs1((long long)(path), (long long)(buf),   \
+                                        (long long)(flags))
+#define __sanitizer_syscall_post_statvfs1(res, path, buf, flags)               \
+  __sanitizer_syscall_post_impl_statvfs1(res, (long long)(path),               \
+                                         (long long)(buf), (long long)(flags))
+#define __sanitizer_syscall_pre_fstatvfs1(fd, buf, flags)                      \
+  __sanitizer_syscall_pre_impl_fstatvfs1((long long)(fd), (long long)(buf),    \
+                                         (long long)(flags))
+#define __sanitizer_syscall_post_fstatvfs1(res, fd, buf, flags)                \
+  __sanitizer_syscall_post_impl_fstatvfs1(                                     \
+      res, (long long)(fd), (long long)(buf), (long long)(flags))
+#define __sanitizer_syscall_pre_compat_30_fhstatvfs1(fhp, buf, flags)          \
+  __sanitizer_syscall_pre_impl_compat_30_fhstatvfs1(                           \
+      (long long)(fhp), (long long)(buf), (long long)(flags))
+#define __sanitizer_syscall_post_compat_30_fhstatvfs1(res, fhp, buf, flags)    \
+  __sanitizer_syscall_post_impl_compat_30_fhstatvfs1(                          \
+      res, (long long)(fhp), (long long)(buf), (long long)(flags))
+#define __sanitizer_syscall_pre_extattrctl(path, cmd, filename, attrnamespace, \
+                                           attrname)                           \
+  __sanitizer_syscall_pre_impl_extattrctl(                                     \
+      (long long)(path), (long long)(cmd), (long long)(filename),              \
+      (long long)(attrnamespace), (long long)(attrname))
+#define __sanitizer_syscall_post_extattrctl(res, path, cmd, filename,          \
+                                            attrnamespace, attrname)           \
+  __sanitizer_syscall_post_impl_extattrctl(                                    \
+      res, (long long)(path), (long long)(cmd), (long long)(filename),         \
+      (long long)(attrnamespace), (long long)(attrname))
+#define __sanitizer_syscall_pre_extattr_set_file(path, attrnamespace,          \
+                                                 attrname, data, nbytes)       \
+  __sanitizer_syscall_pre_impl_extattr_set_file(                               \
+      (long long)(path), (long long)(attrnamespace), (long long)(attrname),    \
+      (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_post_extattr_set_file(res, path, attrnamespace,    \
+                                                  attrname, data, nbytes)      \
+  __sanitizer_syscall_post_impl_extattr_set_file(                              \
+      res, (long long)(path), (long long)(attrnamespace),                      \
+      (long long)(attrname), (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_pre_extattr_get_file(path, attrnamespace,          \
+                                                 attrname, data, nbytes)       \
+  __sanitizer_syscall_pre_impl_extattr_get_file(                               \
+      (long long)(path), (long long)(attrnamespace), (long long)(attrname),    \
+      (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_post_extattr_get_file(res, path, attrnamespace,    \
+                                                  attrname, data, nbytes)      \
+  __sanitizer_syscall_post_impl_extattr_get_file(                              \
+      res, (long long)(path), (long long)(attrnamespace),                      \
+      (long long)(attrname), (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_pre_extattr_delete_file(path, attrnamespace,       \
+                                                    attrname)                  \
+  __sanitizer_syscall_pre_impl_extattr_delete_file(                            \
+      (long long)(path), (long long)(attrnamespace), (long long)(attrname))
+#define __sanitizer_syscall_post_extattr_delete_file(res, path, attrnamespace, \
+                                                     attrname)                 \
+  __sanitizer_syscall_post_impl_extattr_delete_file(                           \
+      res, (long long)(path), (long long)(attrnamespace),                      \
+      (long long)(attrname))
+#define __sanitizer_syscall_pre_extattr_set_fd(fd, attrnamespace, attrname,    \
+                                               data, nbytes)                   \
+  __sanitizer_syscall_pre_impl_extattr_set_fd(                                 \
+      (long long)(fd), (long long)(attrnamespace), (long long)(attrname),      \
+      (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_post_extattr_set_fd(res, fd, attrnamespace,        \
+                                                attrname, data, nbytes)        \
+  __sanitizer_syscall_post_impl_extattr_set_fd(                                \
+      res, (long long)(fd), (long long)(attrnamespace), (long long)(attrname), \
+      (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_pre_extattr_get_fd(fd, attrnamespace, attrname,    \
+                                               data, nbytes)                   \
+  __sanitizer_syscall_pre_impl_extattr_get_fd(                                 \
+      (long long)(fd), (long long)(attrnamespace), (long long)(attrname),      \
+      (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_post_extattr_get_fd(res, fd, attrnamespace,        \
+                                                attrname, data, nbytes)        \
+  __sanitizer_syscall_post_impl_extattr_get_fd(                                \
+      res, (long long)(fd), (long long)(attrnamespace), (long long)(attrname), \
+      (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_pre_extattr_delete_fd(fd, attrnamespace, attrname) \
+  __sanitizer_syscall_pre_impl_extattr_delete_fd(                              \
+      (long long)(fd), (long long)(attrnamespace), (long long)(attrname))
+#define __sanitizer_syscall_post_extattr_delete_fd(res, fd, attrnamespace,     \
+                                                   attrname)                   \
+  __sanitizer_syscall_post_impl_extattr_delete_fd(                             \
+      res, (long long)(fd), (long long)(attrnamespace), (long long)(attrname))
+#define __sanitizer_syscall_pre_extattr_set_link(path, attrnamespace,          \
+                                                 attrname, data, nbytes)       \
+  __sanitizer_syscall_pre_impl_extattr_set_link(                               \
+      (long long)(path), (long long)(attrnamespace), (long long)(attrname),    \
+      (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_post_extattr_set_link(res, path, attrnamespace,    \
+                                                  attrname, data, nbytes)      \
+  __sanitizer_syscall_post_impl_extattr_set_link(                              \
+      res, (long long)(path), (long long)(attrnamespace),                      \
+      (long long)(attrname), (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_pre_extattr_get_link(path, attrnamespace,          \
+                                                 attrname, data, nbytes)       \
+  __sanitizer_syscall_pre_impl_extattr_get_link(                               \
+      (long long)(path), (long long)(attrnamespace), (long long)(attrname),    \
+      (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_post_extattr_get_link(res, path, attrnamespace,    \
+                                                  attrname, data, nbytes)      \
+  __sanitizer_syscall_post_impl_extattr_get_link(                              \
+      res, (long long)(path), (long long)(attrnamespace),                      \
+      (long long)(attrname), (long long)(data), (long long)(nbytes))
+#define __sanitizer_syscall_pre_extattr_delete_link(path, attrnamespace,       \
+                                                    attrname)                  \
+  __sanitizer_syscall_pre_impl_extattr_delete_link(                            \
+      (long long)(path), (long long)(attrnamespace), (long long)(attrname))
+#define __sanitizer_syscall_post_extattr_delete_link(res, path, attrnamespace, \
+                                                     attrname)                 \
+  __sanitizer_syscall_post_impl_extattr_delete_link(                           \
+      res, (long long)(path), (long long)(attrnamespace),                      \
+      (long long)(attrname))
+#define __sanitizer_syscall_pre_extattr_list_fd(fd, attrnamespace, data,       \
+                                                nbytes)                        \
+  __sanitizer_syscall_pre_impl_extattr_list_fd(                                \
+      (long long)(fd), (long long)(attrnamespace), (long long)(data),          \
+      (long long)(nbytes))
+#define __sanitizer_syscall_post_extattr_list_fd(res, fd, attrnamespace, data, \
+                                                 nbytes)                       \
+  __sanitizer_syscall_post_impl_extattr_list_fd(                               \
+      res, (long long)(fd), (long long)(attrnamespace), (long long)(data),     \
+      (long long)(nbytes))
+#define __sanitizer_syscall_pre_extattr_list_file(path, attrnamespace, data,   \
+                                                  nbytes)                      \
+  __sanitizer_syscall_pre_impl_extattr_list_file(                              \
+      (long long)(path), (long long)(attrnamespace), (long long)(data),        \
+      (long long)(nbytes))
+#define __sanitizer_syscall_post_extattr_list_file(res, path, attrnamespace,   \
+                                                   data, nbytes)               \
+  __sanitizer_syscall_post_impl_extattr_list_file(                             \
+      res, (long long)(path), (long long)(attrnamespace), (long long)(data),   \
+      (long long)(nbytes))
+#define __sanitizer_syscall_pre_extattr_list_link(path, attrnamespace, data,   \
+                                                  nbytes)                      \
+  __sanitizer_syscall_pre_impl_extattr_list_link(                              \
+      (long long)(path), (long long)(attrnamespace), (long long)(data),        \
+      (long long)(nbytes))
+#define __sanitizer_syscall_post_extattr_list_link(res, path, attrnamespace,   \
+                                                   data, nbytes)               \
+  __sanitizer_syscall_post_impl_extattr_list_link(                             \
+      res, (long long)(path), (long long)(attrnamespace), (long long)(data),   \
+      (long long)(nbytes))
+#define __sanitizer_syscall_pre_compat_50_pselect(nd, in, ou, ex, ts, mask)    \
+  __sanitizer_syscall_pre_impl_compat_50_pselect(                              \
+      (long long)(nd), (long long)(in), (long long)(ou), (long long)(ex),      \
+      (long long)(ts), (long long)(mask))
+#define __sanitizer_syscall_post_compat_50_pselect(res, nd, in, ou, ex, ts,    \
+                                                   mask)                       \
+  __sanitizer_syscall_post_impl_compat_50_pselect(                             \
+      res, (long long)(nd), (long long)(in), (long long)(ou), (long long)(ex), \
+      (long long)(ts), (long long)(mask))
+#define __sanitizer_syscall_pre_compat_50_pollts(fds, nfds, ts, mask)          \
+  __sanitizer_syscall_pre_impl_compat_50_pollts(                               \
+      (long long)(fds), (long long)(nfds), (long long)(ts), (long long)(mask))
+#define __sanitizer_syscall_post_compat_50_pollts(res, fds, nfds, ts, mask)    \
+  __sanitizer_syscall_post_impl_compat_50_pollts(                              \
+      res, (long long)(fds), (long long)(nfds), (long long)(ts),               \
+      (long long)(mask))
+#define __sanitizer_syscall_pre_setxattr(path, name, value, size, flags)       \
+  __sanitizer_syscall_pre_impl_setxattr((long long)(path), (long long)(name),  \
+                                        (long long)(value), (long long)(size), \
+                                        (long long)(flags))
+#define __sanitizer_syscall_post_setxattr(res, path, name, value, size, flags) \
+  __sanitizer_syscall_post_impl_setxattr(                                      \
+      res, (long long)(path), (long long)(name), (long long)(value),           \
+      (long long)(size), (long long)(flags))
+#define __sanitizer_syscall_pre_lsetxattr(path, name, value, size, flags)      \
+  __sanitizer_syscall_pre_impl_lsetxattr(                                      \
+      (long long)(path), (long long)(name), (long long)(value),                \
+      (long long)(size), (long long)(flags))
+#define __sanitizer_syscall_post_lsetxattr(res, path, name, value, size,       \
+                                           flags)                              \
+  __sanitizer_syscall_post_impl_lsetxattr(                                     \
+      res, (long long)(path), (long long)(name), (long long)(value),           \
+      (long long)(size), (long long)(flags))
+#define __sanitizer_syscall_pre_fsetxattr(fd, name, value, size, flags)        \
+  __sanitizer_syscall_pre_impl_fsetxattr(                                      \
+      (long long)(fd), (long long)(name), (long long)(value),                  \
+      (long long)(size), (long long)(flags))
+#define __sanitizer_syscall_post_fsetxattr(res, fd, name, value, size, flags)  \
+  __sanitizer_syscall_post_impl_fsetxattr(                                     \
+      res, (long long)(fd), (long long)(name), (long long)(value),             \
+      (long long)(size), (long long)(flags))
+#define __sanitizer_syscall_pre_getxattr(path, name, value, size)              \
+  __sanitizer_syscall_pre_impl_getxattr((long long)(path), (long long)(name),  \
+                                        (long long)(value), (long long)(size))
+#define __sanitizer_syscall_post_getxattr(res, path, name, value, size)        \
+  __sanitizer_syscall_post_impl_getxattr(                                      \
+      res, (long long)(path), (long long)(name), (long long)(value),           \
+      (long long)(size))
+#define __sanitizer_syscall_pre_lgetxattr(path, name, value, size)             \
+  __sanitizer_syscall_pre_impl_lgetxattr((long long)(path), (long long)(name), \
+                                         (long long)(value),                   \
+                                         (long long)(size))
+#define __sanitizer_syscall_post_lgetxattr(res, path, name, value, size)       \
+  __sanitizer_syscall_post_impl_lgetxattr(                                     \
+      res, (long long)(path), (long long)(name), (long long)(value),           \
+      (long long)(size))
+#define __sanitizer_syscall_pre_fgetxattr(fd, name, value, size)               \
+  __sanitizer_syscall_pre_impl_fgetxattr((long long)(fd), (long long)(name),   \
+                                         (long long)(value),                   \
+                                         (long long)(size))
+#define __sanitizer_syscall_post_fgetxattr(res, fd, name, value, size)         \
+  __sanitizer_syscall_post_impl_fgetxattr(                                     \
+      res, (long long)(fd), (long long)(name), (long long)(value),             \
+      (long long)(size))
+#define __sanitizer_syscall_pre_listxattr(path, list, size)                    \
+  __sanitizer_syscall_pre_impl_listxattr((long long)(path), (long long)(list), \
+                                         (long long)(size))
+#define __sanitizer_syscall_post_listxattr(res, path, list, size)              \
+  __sanitizer_syscall_post_impl_listxattr(                                     \
+      res, (long long)(path), (long long)(list), (long long)(size))
+#define __sanitizer_syscall_pre_llistxattr(path, list, size)                   \
+  __sanitizer_syscall_pre_impl_llistxattr(                                     \
+      (long long)(path), (long long)(list), (long long)(size))
+#define __sanitizer_syscall_post_llistxattr(res, path, list, size)             \
+  __sanitizer_syscall_post_impl_llistxattr(                                    \
+      res, (long long)(path), (long long)(list), (long long)(size))
+#define __sanitizer_syscall_pre_flistxattr(fd, list, size)                     \
+  __sanitizer_syscall_pre_impl_flistxattr((long long)(fd), (long long)(list),  \
+                                          (long long)(size))
+#define __sanitizer_syscall_post_flistxattr(res, fd, list, size)               \
+  __sanitizer_syscall_post_impl_flistxattr(                                    \
+      res, (long long)(fd), (long long)(list), (long long)(size))
+#define __sanitizer_syscall_pre_removexattr(path, name)                        \
+  __sanitizer_syscall_pre_impl_removexattr((long long)(path), (long long)(name))
+#define __sanitizer_syscall_post_removexattr(res, path, name)                  \
+  __sanitizer_syscall_post_impl_removexattr(res, (long long)(path),            \
+                                            (long long)(name))
+#define __sanitizer_syscall_pre_lremovexattr(path, name)                       \
+  __sanitizer_syscall_pre_impl_lremovexattr((long long)(path),                 \
+                                            (long long)(name))
+#define __sanitizer_syscall_post_lremovexattr(res, path, name)                 \
+  __sanitizer_syscall_post_impl_lremovexattr(res, (long long)(path),           \
+                                             (long long)(name))
+#define __sanitizer_syscall_pre_fremovexattr(fd, name)                         \
+  __sanitizer_syscall_pre_impl_fremovexattr((long long)(fd), (long long)(name))
+#define __sanitizer_syscall_post_fremovexattr(res, fd, name)                   \
+  __sanitizer_syscall_post_impl_fremovexattr(res, (long long)(fd),             \
+                                             (long long)(name))
+#define __sanitizer_syscall_pre_compat_50___stat30(path, ub)                   \
+  __sanitizer_syscall_pre_impl_compat_50___stat30((long long)(path),           \
+                                                  (long long)(ub))
+#define __sanitizer_syscall_post_compat_50___stat30(res, path, ub)             \
+  __sanitizer_syscall_post_impl_compat_50___stat30(res, (long long)(path),     \
+                                                   (long long)(ub))
+#define __sanitizer_syscall_pre_compat_50___fstat30(fd, sb)                    \
+  __sanitizer_syscall_pre_impl_compat_50___fstat30((long long)(fd),            \
+                                                   (long long)(sb))
+#define __sanitizer_syscall_post_compat_50___fstat30(res, fd, sb)              \
+  __sanitizer_syscall_post_impl_compat_50___fstat30(res, (long long)(fd),      \
+                                                    (long long)(sb))
+#define __sanitizer_syscall_pre_compat_50___lstat30(path, ub)                  \
+  __sanitizer_syscall_pre_impl_compat_50___lstat30((long long)(path),          \
+                                                   (long long)(ub))
+#define __sanitizer_syscall_post_compat_50___lstat30(res, path, ub)            \
+  __sanitizer_syscall_post_impl_compat_50___lstat30(res, (long long)(path),    \
+                                                    (long long)(ub))
+#define __sanitizer_syscall_pre___getdents30(fd, buf, count)                   \
+  __sanitizer_syscall_pre_impl___getdents30((long long)(fd), (long long)(buf), \
+                                            (long long)(count))
+#define __sanitizer_syscall_post___getdents30(res, fd, buf, count)             \
+  __sanitizer_syscall_post_impl___getdents30(                                  \
+      res, (long long)(fd), (long long)(buf), (long long)(count))
+#define __sanitizer_syscall_pre_posix_fadvise()                                \
+  __sanitizer_syscall_pre_impl_posix_fadvise()
+#define __sanitizer_syscall_post_posix_fadvise(res)                            \
+  __sanitizer_syscall_post_impl_posix_fadvise(res)
+#define __sanitizer_syscall_pre_compat_30___fhstat30(fhp, sb)                  \
+  __sanitizer_syscall_pre_impl_compat_30___fhstat30((long long)(fhp),          \
+                                                    (long long)(sb))
+#define __sanitizer_syscall_post_compat_30___fhstat30(res, fhp, sb)            \
+  __sanitizer_syscall_post_impl_compat_30___fhstat30(res, (long long)(fhp),    \
+                                                     (long long)(sb))
+#define __sanitizer_syscall_pre_compat_50___ntp_gettime30(ntvp)                \
+  __sanitizer_syscall_pre_impl_compat_50___ntp_gettime30((long long)(ntvp))
+#define __sanitizer_syscall_post_compat_50___ntp_gettime30(res, ntvp)          \
+  __sanitizer_syscall_post_impl_compat_50___ntp_gettime30(res,                 \
+                                                          (long long)(ntvp))
+#define __sanitizer_syscall_pre___socket30(domain, type, protocol)             \
+  __sanitizer_syscall_pre_impl___socket30(                                     \
+      (long long)(domain), (long long)(type), (long long)(protocol))
+#define __sanitizer_syscall_post___socket30(res, domain, type, protocol)       \
+  __sanitizer_syscall_post_impl___socket30(                                    \
+      res, (long long)(domain), (long long)(type), (long long)(protocol))
+#define __sanitizer_syscall_pre___getfh30(fname, fhp, fh_size)                 \
+  __sanitizer_syscall_pre_impl___getfh30((long long)(fname), (long long)(fhp), \
+                                         (long long)(fh_size))
+#define __sanitizer_syscall_post___getfh30(res, fname, fhp, fh_size)           \
+  __sanitizer_syscall_post_impl___getfh30(                                     \
+      res, (long long)(fname), (long long)(fhp), (long long)(fh_size))
+#define __sanitizer_syscall_pre___fhopen40(fhp, fh_size, flags)                \
+  __sanitizer_syscall_pre_impl___fhopen40(                                     \
+      (long long)(fhp), (long long)(fh_size), (long long)(flags))
+#define __sanitizer_syscall_post___fhopen40(res, fhp, fh_size, flags)          \
+  __sanitizer_syscall_post_impl___fhopen40(                                    \
+      res, (long long)(fhp), (long long)(fh_size), (long long)(flags))
+#define __sanitizer_syscall_pre___fhstatvfs140(fhp, fh_size, buf, flags)       \
+  __sanitizer_syscall_pre_impl___fhstatvfs140(                                 \
+      (long long)(fhp), (long long)(fh_size), (long long)(buf),                \
+      (long long)(flags))
+#define __sanitizer_syscall_post___fhstatvfs140(res, fhp, fh_size, buf, flags) \
+  __sanitizer_syscall_post_impl___fhstatvfs140(                                \
+      res, (long long)(fhp), (long long)(fh_size), (long long)(buf),           \
+      (long long)(flags))
+#define __sanitizer_syscall_pre_compat_50___fhstat40(fhp, fh_size, sb)         \
+  __sanitizer_syscall_pre_impl_compat_50___fhstat40(                           \
+      (long long)(fhp), (long long)(fh_size), (long long)(sb))
+#define __sanitizer_syscall_post_compat_50___fhstat40(res, fhp, fh_size, sb)   \
+  __sanitizer_syscall_post_impl_compat_50___fhstat40(                          \
+      res, (long long)(fhp), (long long)(fh_size), (long long)(sb))
+#define __sanitizer_syscall_pre_aio_cancel(fildes, aiocbp)                     \
+  __sanitizer_syscall_pre_impl_aio_cancel((long long)(fildes),                 \
+                                          (long long)(aiocbp))
+#define __sanitizer_syscall_post_aio_cancel(res, fildes, aiocbp)               \
+  __sanitizer_syscall_post_impl_aio_cancel(res, (long long)(fildes),           \
+                                           (long long)(aiocbp))
+#define __sanitizer_syscall_pre_aio_error(aiocbp)                              \
+  __sanitizer_syscall_pre_impl_aio_error((long long)(aiocbp))
+#define __sanitizer_syscall_post_aio_error(res, aiocbp)                        \
+  __sanitizer_syscall_post_impl_aio_error(res, (long long)(aiocbp))
+#define __sanitizer_syscall_pre_aio_fsync(op, aiocbp)                          \
+  __sanitizer_syscall_pre_impl_aio_fsync((long long)(op), (long long)(aiocbp))
+#define __sanitizer_syscall_post_aio_fsync(res, op, aiocbp)                    \
+  __sanitizer_syscall_post_impl_aio_fsync(res, (long long)(op),                \
+                                          (long long)(aiocbp))
+#define __sanitizer_syscall_pre_aio_read(aiocbp)                               \
+  __sanitizer_syscall_pre_impl_aio_read((long long)(aiocbp))
+#define __sanitizer_syscall_post_aio_read(res, aiocbp)                         \
+  __sanitizer_syscall_post_impl_aio_read(res, (long long)(aiocbp))
+#define __sanitizer_syscall_pre_aio_return(aiocbp)                             \
+  __sanitizer_syscall_pre_impl_aio_return((long long)(aiocbp))
+#define __sanitizer_syscall_post_aio_return(res, aiocbp)                       \
+  __sanitizer_syscall_post_impl_aio_return(res, (long long)(aiocbp))
+#define __sanitizer_syscall_pre_compat_50_aio_suspend(list, nent, timeout)     \
+  __sanitizer_syscall_pre_impl_compat_50_aio_suspend(                          \
+      (long long)(list), (long long)(nent), (long long)(timeout))
+#define __sanitizer_syscall_post_compat_50_aio_suspend(res, list, nent,        \
+                                                       timeout)                \
+  __sanitizer_syscall_post_impl_compat_50_aio_suspend(                         \
+      res, (long long)(list), (long long)(nent), (long long)(timeout))
+#define __sanitizer_syscall_pre_aio_write(aiocbp)                              \
+  __sanitizer_syscall_pre_impl_aio_write((long long)(aiocbp))
+#define __sanitizer_syscall_post_aio_write(res, aiocbp)                        \
+  __sanitizer_syscall_post_impl_aio_write(res, (long long)(aiocbp))
+#define __sanitizer_syscall_pre_lio_listio(mode, list, nent, sig)              \
+  __sanitizer_syscall_pre_impl_lio_listio((long long)(mode),                   \
+                                          (long long)(list),                   \
+                                          (long long)(nent), (long long)(sig))
+#define __sanitizer_syscall_post_lio_listio(res, mode, list, nent, sig)        \
+  __sanitizer_syscall_post_impl_lio_listio(                                    \
+      res, (long long)(mode), (long long)(list), (long long)(nent),            \
+      (long long)(sig))
+/* syscall 407 has been skipped */
+/* syscall 408 has been skipped */
+/* syscall 409 has been skipped */
+#define __sanitizer_syscall_pre___mount50(type, path, flags, data, data_len)   \
+  __sanitizer_syscall_pre_impl___mount50(                                      \
+      (long long)(type), (long long)(path), (long long)(flags),                \
+      (long long)(data), (long long)(data_len))
+#define __sanitizer_syscall_post___mount50(res, type, path, flags, data,       \
+                                           data_len)                           \
+  __sanitizer_syscall_post_impl___mount50(                                     \
+      res, (long long)(type), (long long)(path), (long long)(flags),           \
+      (long long)(data), (long long)(data_len))
+#define __sanitizer_syscall_pre_mremap(old_address, old_size, new_address,     \
+                                       new_size, flags)                        \
+  __sanitizer_syscall_pre_impl_mremap(                                         \
+      (long long)(old_address), (long long)(old_size),                         \
+      (long long)(new_address), (long long)(new_size), (long long)(flags))
+#define __sanitizer_syscall_post_mremap(res, old_address, old_size,            \
+                                        new_address, new_size, flags)          \
+  __sanitizer_syscall_post_impl_mremap(                                        \
+      res, (long long)(old_address), (long long)(old_size),                    \
+      (long long)(new_address), (long long)(new_size), (long long)(flags))
+#define __sanitizer_syscall_pre_pset_create(psid)                              \
+  __sanitizer_syscall_pre_impl_pset_create((long long)(psid))
+#define __sanitizer_syscall_post_pset_create(res, psid)                        \
+  __sanitizer_syscall_post_impl_pset_create(res, (long long)(psid))
+#define __sanitizer_syscall_pre_pset_destroy(psid)                             \
+  __sanitizer_syscall_pre_impl_pset_destroy((long long)(psid))
+#define __sanitizer_syscall_post_pset_destroy(res, psid)                       \
+  __sanitizer_syscall_post_impl_pset_destroy(res, (long long)(psid))
+#define __sanitizer_syscall_pre_pset_assign(psid, cpuid, opsid)                \
+  __sanitizer_syscall_pre_impl_pset_assign(                                    \
+      (long long)(psid), (long long)(cpuid), (long long)(opsid))
+#define __sanitizer_syscall_post_pset_assign(res, psid, cpuid, opsid)          \
+  __sanitizer_syscall_post_impl_pset_assign(                                   \
+      res, (long long)(psid), (long long)(cpuid), (long long)(opsid))
+#define __sanitizer_syscall_pre__pset_bind(idtype, first_id, second_id, psid,  \
+                                           opsid)                              \
+  __sanitizer_syscall_pre_impl__pset_bind(                                     \
+      (long long)(idtype), (long long)(first_id), (long long)(second_id),      \
+      (long long)(psid), (long long)(opsid))
+#define __sanitizer_syscall_post__pset_bind(res, idtype, first_id, second_id,  \
+                                            psid, opsid)                       \
+  __sanitizer_syscall_post_impl__pset_bind(                                    \
+      res, (long long)(idtype), (long long)(first_id), (long long)(second_id), \
+      (long long)(psid), (long long)(opsid))
+#define __sanitizer_syscall_pre___posix_fadvise50(fd, PAD, offset, len,        \
+                                                  advice)                      \
+  __sanitizer_syscall_pre_impl___posix_fadvise50(                              \
+      (long long)(fd), (long long)(PAD), (long long)(offset),                  \
+      (long long)(len), (long long)(advice))
+#define __sanitizer_syscall_post___posix_fadvise50(res, fd, PAD, offset, len,  \
+                                                   advice)                     \
+  __sanitizer_syscall_post_impl___posix_fadvise50(                             \
+      res, (long long)(fd), (long long)(PAD), (long long)(offset),             \
+      (long long)(len), (long long)(advice))
+#define __sanitizer_syscall_pre___select50(nd, in, ou, ex, tv)                 \
+  __sanitizer_syscall_pre_impl___select50((long long)(nd), (long long)(in),    \
+                                          (long long)(ou), (long long)(ex),    \
+                                          (long long)(tv))
+#define __sanitizer_syscall_post___select50(res, nd, in, ou, ex, tv)           \
+  __sanitizer_syscall_post_impl___select50(res, (long long)(nd),               \
+                                           (long long)(in), (long long)(ou),   \
+                                           (long long)(ex), (long long)(tv))
+#define __sanitizer_syscall_pre___gettimeofday50(tp, tzp)                      \
+  __sanitizer_syscall_pre_impl___gettimeofday50((long long)(tp),               \
+                                                (long long)(tzp))
+#define __sanitizer_syscall_post___gettimeofday50(res, tp, tzp)                \
+  __sanitizer_syscall_post_impl___gettimeofday50(res, (long long)(tp),         \
+                                                 (long long)(tzp))
+#define __sanitizer_syscall_pre___settimeofday50(tv, tzp)                      \
+  __sanitizer_syscall_pre_impl___settimeofday50((long long)(tv),               \
+                                                (long long)(tzp))
+#define __sanitizer_syscall_post___settimeofday50(res, tv, tzp)                \
+  __sanitizer_syscall_post_impl___settimeofday50(res, (long long)(tv),         \
+                                                 (long long)(tzp))
+#define __sanitizer_syscall_pre___utimes50(path, tptr)                         \
+  __sanitizer_syscall_pre_impl___utimes50((long long)(path), (long long)(tptr))
+#define __sanitizer_syscall_post___utimes50(res, path, tptr)                   \
+  __sanitizer_syscall_post_impl___utimes50(res, (long long)(path),             \
+                                           (long long)(tptr))
+#define __sanitizer_syscall_pre___adjtime50(delta, olddelta)                   \
+  __sanitizer_syscall_pre_impl___adjtime50((long long)(delta),                 \
+                                           (long long)(olddelta))
+#define __sanitizer_syscall_post___adjtime50(res, delta, olddelta)             \
+  __sanitizer_syscall_post_impl___adjtime50(res, (long long)(delta),           \
+                                            (long long)(olddelta))
+#define __sanitizer_syscall_pre___lfs_segwait50(fsidp, tv)                     \
+  __sanitizer_syscall_pre_impl___lfs_segwait50((long long)(fsidp),             \
+                                               (long long)(tv))
+#define __sanitizer_syscall_post___lfs_segwait50(res, fsidp, tv)               \
+  __sanitizer_syscall_post_impl___lfs_segwait50(res, (long long)(fsidp),       \
+                                                (long long)(tv))
+#define __sanitizer_syscall_pre___futimes50(fd, tptr)                          \
+  __sanitizer_syscall_pre_impl___futimes50((long long)(fd), (long long)(tptr))
+#define __sanitizer_syscall_post___futimes50(res, fd, tptr)                    \
+  __sanitizer_syscall_post_impl___futimes50(res, (long long)(fd),              \
+                                            (long long)(tptr))
+#define __sanitizer_syscall_pre___lutimes50(path, tptr)                        \
+  __sanitizer_syscall_pre_impl___lutimes50((long long)(path), (long long)(tptr))
+#define __sanitizer_syscall_post___lutimes50(res, path, tptr)                  \
+  __sanitizer_syscall_post_impl___lutimes50(res, (long long)(path),            \
+                                            (long long)(tptr))
+#define __sanitizer_syscall_pre___setitimer50(which, itv, oitv)                \
+  __sanitizer_syscall_pre_impl___setitimer50(                                  \
+      (long long)(which), (long long)(itv), (long long)(oitv))
+#define __sanitizer_syscall_post___setitimer50(res, which, itv, oitv)          \
+  __sanitizer_syscall_post_impl___setitimer50(                                 \
+      res, (long long)(which), (long long)(itv), (long long)(oitv))
+#define __sanitizer_syscall_pre___getitimer50(which, itv)                      \
+  __sanitizer_syscall_pre_impl___getitimer50((long long)(which),               \
+                                             (long long)(itv))
+#define __sanitizer_syscall_post___getitimer50(res, which, itv)                \
+  __sanitizer_syscall_post_impl___getitimer50(res, (long long)(which),         \
+                                              (long long)(itv))
+#define __sanitizer_syscall_pre___clock_gettime50(clock_id, tp)                \
+  __sanitizer_syscall_pre_impl___clock_gettime50((long long)(clock_id),        \
+                                                 (long long)(tp))
+#define __sanitizer_syscall_post___clock_gettime50(res, clock_id, tp)          \
+  __sanitizer_syscall_post_impl___clock_gettime50(res, (long long)(clock_id),  \
+                                                  (long long)(tp))
+#define __sanitizer_syscall_pre___clock_settime50(clock_id, tp)                \
+  __sanitizer_syscall_pre_impl___clock_settime50((long long)(clock_id),        \
+                                                 (long long)(tp))
+#define __sanitizer_syscall_post___clock_settime50(res, clock_id, tp)          \
+  __sanitizer_syscall_post_impl___clock_settime50(res, (long long)(clock_id),  \
+                                                  (long long)(tp))
+#define __sanitizer_syscall_pre___clock_getres50(clock_id, tp)                 \
+  __sanitizer_syscall_pre_impl___clock_getres50((long long)(clock_id),         \
+                                                (long long)(tp))
+#define __sanitizer_syscall_post___clock_getres50(res, clock_id, tp)           \
+  __sanitizer_syscall_post_impl___clock_getres50(res, (long long)(clock_id),   \
+                                                 (long long)(tp))
+#define __sanitizer_syscall_pre___nanosleep50(rqtp, rmtp)                      \
+  __sanitizer_syscall_pre_impl___nanosleep50((long long)(rqtp),                \
+                                             (long long)(rmtp))
+#define __sanitizer_syscall_post___nanosleep50(res, rqtp, rmtp)                \
+  __sanitizer_syscall_post_impl___nanosleep50(res, (long long)(rqtp),          \
+                                              (long long)(rmtp))
+#define __sanitizer_syscall_pre_____sigtimedwait50(set, info, timeout)         \
+  __sanitizer_syscall_pre_impl_____sigtimedwait50(                             \
+      (long long)(set), (long long)(info), (long long)(timeout))
+#define __sanitizer_syscall_post_____sigtimedwait50(res, set, info, timeout)   \
+  __sanitizer_syscall_post_impl_____sigtimedwait50(                            \
+      res, (long long)(set), (long long)(info), (long long)(timeout))
+#define __sanitizer_syscall_pre___mq_timedsend50(mqdes, msg_ptr, msg_len,      \
+                                                 msg_prio, abs_timeout)        \
+  __sanitizer_syscall_pre_impl___mq_timedsend50(                               \
+      (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),          \
+      (long long)(msg_prio), (long long)(abs_timeout))
+#define __sanitizer_syscall_post___mq_timedsend50(                             \
+    res, mqdes, msg_ptr, msg_len, msg_prio, abs_timeout)                       \
+  __sanitizer_syscall_post_impl___mq_timedsend50(                              \
+      res, (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),     \
+      (long long)(msg_prio), (long long)(abs_timeout))
+#define __sanitizer_syscall_pre___mq_timedreceive50(mqdes, msg_ptr, msg_len,   \
+                                                    msg_prio, abs_timeout)     \
+  __sanitizer_syscall_pre_impl___mq_timedreceive50(                            \
+      (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),          \
+      (long long)(msg_prio), (long long)(abs_timeout))
+#define __sanitizer_syscall_post___mq_timedreceive50(                          \
+    res, mqdes, msg_ptr, msg_len, msg_prio, abs_timeout)                       \
+  __sanitizer_syscall_post_impl___mq_timedreceive50(                           \
+      res, (long long)(mqdes), (long long)(msg_ptr), (long long)(msg_len),     \
+      (long long)(msg_prio), (long long)(abs_timeout))
+#define __sanitizer_syscall_pre_compat_60__lwp_park(ts, unpark, hint,          \
+                                                    unparkhint)                \
+  __sanitizer_syscall_pre_impl_compat_60__lwp_park(                            \
+      (long long)(ts), (long long)(unpark), (long long)(hint),                 \
+      (long long)(unparkhint))
+#define __sanitizer_syscall_post_compat_60__lwp_park(res, ts, unpark, hint,    \
+                                                     unparkhint)               \
+  __sanitizer_syscall_post_impl_compat_60__lwp_park(                           \
+      res, (long long)(ts), (long long)(unpark), (long long)(hint),            \
+      (long long)(unparkhint))
+#define __sanitizer_syscall_pre___kevent50(fd, changelist, nchanges,           \
+                                           eventlist, nevents, timeout)        \
+  __sanitizer_syscall_pre_impl___kevent50(                                     \
+      (long long)(fd), (long long)(changelist), (long long)(nchanges),         \
+      (long long)(eventlist), (long long)(nevents), (long long)(timeout))
+#define __sanitizer_syscall_post___kevent50(res, fd, changelist, nchanges,     \
+                                            eventlist, nevents, timeout)       \
+  __sanitizer_syscall_post_impl___kevent50(                                    \
+      res, (long long)(fd), (long long)(changelist), (long long)(nchanges),    \
+      (long long)(eventlist), (long long)(nevents), (long long)(timeout))
+#define __sanitizer_syscall_pre___pselect50(nd, in, ou, ex, ts, mask)          \
+  __sanitizer_syscall_pre_impl___pselect50((long long)(nd), (long long)(in),   \
+                                           (long long)(ou), (long long)(ex),   \
+                                           (long long)(ts), (long long)(mask))
+#define __sanitizer_syscall_post___pselect50(res, nd, in, ou, ex, ts, mask)    \
+  __sanitizer_syscall_post_impl___pselect50(                                   \
+      res, (long long)(nd), (long long)(in), (long long)(ou), (long long)(ex), \
+      (long long)(ts), (long long)(mask))
+#define __sanitizer_syscall_pre___pollts50(fds, nfds, ts, mask)                \
+  __sanitizer_syscall_pre_impl___pollts50((long long)(fds), (long long)(nfds), \
+                                          (long long)(ts), (long long)(mask))
+#define __sanitizer_syscall_post___pollts50(res, fds, nfds, ts, mask)          \
+  __sanitizer_syscall_post_impl___pollts50(res, (long long)(fds),              \
+                                           (long long)(nfds), (long long)(ts), \
+                                           (long long)(mask))
+#define __sanitizer_syscall_pre___aio_suspend50(list, nent, timeout)           \
+  __sanitizer_syscall_pre_impl___aio_suspend50(                                \
+      (long long)(list), (long long)(nent), (long long)(timeout))
+#define __sanitizer_syscall_post___aio_suspend50(res, list, nent, timeout)     \
+  __sanitizer_syscall_post_impl___aio_suspend50(                               \
+      res, (long long)(list), (long long)(nent), (long long)(timeout))
+#define __sanitizer_syscall_pre___stat50(path, ub)                             \
+  __sanitizer_syscall_pre_impl___stat50((long long)(path), (long long)(ub))
+#define __sanitizer_syscall_post___stat50(res, path, ub)                       \
+  __sanitizer_syscall_post_impl___stat50(res, (long long)(path),               \
+                                         (long long)(ub))
+#define __sanitizer_syscall_pre___fstat50(fd, sb)                              \
+  __sanitizer_syscall_pre_impl___fstat50((long long)(fd), (long long)(sb))
+#define __sanitizer_syscall_post___fstat50(res, fd, sb)                        \
+  __sanitizer_syscall_post_impl___fstat50(res, (long long)(fd), (long long)(sb))
+#define __sanitizer_syscall_pre___lstat50(path, ub)                            \
+  __sanitizer_syscall_pre_impl___lstat50((long long)(path), (long long)(ub))
+#define __sanitizer_syscall_post___lstat50(res, path, ub)                      \
+  __sanitizer_syscall_post_impl___lstat50(res, (long long)(path),              \
+                                          (long long)(ub))
+#define __sanitizer_syscall_pre_____semctl50(semid, semnum, cmd, arg)          \
+  __sanitizer_syscall_pre_impl_____semctl50(                                   \
+      (long long)(semid), (long long)(semnum), (long long)(cmd),               \
+      (long long)(arg))
+#define __sanitizer_syscall_post_____semctl50(res, semid, semnum, cmd, arg)    \
+  __sanitizer_syscall_post_impl_____semctl50(                                  \
+      res, (long long)(semid), (long long)(semnum), (long long)(cmd),          \
+      (long long)(arg))
+#define __sanitizer_syscall_pre___shmctl50(shmid, cmd, buf)                    \
+  __sanitizer_syscall_pre_impl___shmctl50((long long)(shmid),                  \
+                                          (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_post___shmctl50(res, shmid, cmd, buf)              \
+  __sanitizer_syscall_post_impl___shmctl50(res, (long long)(shmid),            \
+                                           (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_pre___msgctl50(msqid, cmd, buf)                    \
+  __sanitizer_syscall_pre_impl___msgctl50((long long)(msqid),                  \
+                                          (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_post___msgctl50(res, msqid, cmd, buf)              \
+  __sanitizer_syscall_post_impl___msgctl50(res, (long long)(msqid),            \
+                                           (long long)(cmd), (long long)(buf))
+#define __sanitizer_syscall_pre___getrusage50(who, rusage)                     \
+  __sanitizer_syscall_pre_impl___getrusage50((long long)(who),                 \
+                                             (long long)(rusage))
+#define __sanitizer_syscall_post___getrusage50(res, who, rusage)               \
+  __sanitizer_syscall_post_impl___getrusage50(res, (long long)(who),           \
+                                              (long long)(rusage))
+#define __sanitizer_syscall_pre___timer_settime50(timerid, flags, value,       \
+                                                  ovalue)                      \
+  __sanitizer_syscall_pre_impl___timer_settime50(                              \
+      (long long)(timerid), (long long)(flags), (long long)(value),            \
+      (long long)(ovalue))
+#define __sanitizer_syscall_post___timer_settime50(res, timerid, flags, value, \
+                                                   ovalue)                     \
+  __sanitizer_syscall_post_impl___timer_settime50(                             \
+      res, (long long)(timerid), (long long)(flags), (long long)(value),       \
+      (long long)(ovalue))
+#define __sanitizer_syscall_pre___timer_gettime50(timerid, value)              \
+  __sanitizer_syscall_pre_impl___timer_gettime50((long long)(timerid),         \
+                                                 (long long)(value))
+#define __sanitizer_syscall_post___timer_gettime50(res, timerid, value)        \
+  __sanitizer_syscall_post_impl___timer_gettime50(res, (long long)(timerid),   \
+                                                  (long long)(value))
+#if defined(NTP) || !defined(_KERNEL_OPT)
+#define __sanitizer_syscall_pre___ntp_gettime50(ntvp)                          \
+  __sanitizer_syscall_pre_impl___ntp_gettime50((long long)(ntvp))
+#define __sanitizer_syscall_post___ntp_gettime50(res, ntvp)                    \
+  __sanitizer_syscall_post_impl___ntp_gettime50(res, (long long)(ntvp))
+#else
+/* syscall 448 has been skipped */
+#endif
+#define __sanitizer_syscall_pre___wait450(pid, status, options, rusage)        \
+  __sanitizer_syscall_pre_impl___wait450(                                      \
+      (long long)(pid), (long long)(status), (long long)(options),             \
+      (long long)(rusage))
+#define __sanitizer_syscall_post___wait450(res, pid, status, options, rusage)  \
+  __sanitizer_syscall_post_impl___wait450(                                     \
+      res, (long long)(pid), (long long)(status), (long long)(options),        \
+      (long long)(rusage))
+#define __sanitizer_syscall_pre___mknod50(path, mode, dev)                     \
+  __sanitizer_syscall_pre_impl___mknod50((long long)(path), (long long)(mode), \
+                                         (long long)(dev))
+#define __sanitizer_syscall_post___mknod50(res, path, mode, dev)               \
+  __sanitizer_syscall_post_impl___mknod50(res, (long long)(path),              \
+                                          (long long)(mode), (long long)(dev))
+#define __sanitizer_syscall_pre___fhstat50(fhp, fh_size, sb)                   \
+  __sanitizer_syscall_pre_impl___fhstat50(                                     \
+      (long long)(fhp), (long long)(fh_size), (long long)(sb))
+#define __sanitizer_syscall_post___fhstat50(res, fhp, fh_size, sb)             \
+  __sanitizer_syscall_post_impl___fhstat50(                                    \
+      res, (long long)(fhp), (long long)(fh_size), (long long)(sb))
+/* syscall 452 has been skipped */
+#define __sanitizer_syscall_pre_pipe2(fildes, flags)                           \
+  __sanitizer_syscall_pre_impl_pipe2((long long)(fildes), (long long)(flags))
+#define __sanitizer_syscall_post_pipe2(res, fildes, flags)                     \
+  __sanitizer_syscall_post_impl_pipe2(res, (long long)(fildes),                \
+                                      (long long)(flags))
+#define __sanitizer_syscall_pre_dup3(from, to, flags)                          \
+  __sanitizer_syscall_pre_impl_dup3((long long)(from), (long long)(to),        \
+                                    (long long)(flags))
+#define __sanitizer_syscall_post_dup3(res, from, to, flags)                    \
+  __sanitizer_syscall_post_impl_dup3(res, (long long)(from), (long long)(to),  \
+                                     (long long)(flags))
+#define __sanitizer_syscall_pre_kqueue1(flags)                                 \
+  __sanitizer_syscall_pre_impl_kqueue1((long long)(flags))
+#define __sanitizer_syscall_post_kqueue1(res, flags)                           \
+  __sanitizer_syscall_post_impl_kqueue1(res, (long long)(flags))
+#define __sanitizer_syscall_pre_paccept(s, name, anamelen, mask, flags)        \
+  __sanitizer_syscall_pre_impl_paccept((long long)(s), (long long)(name),      \
+                                       (long long)(anamelen),                  \
+                                       (long long)(mask), (long long)(flags))
+#define __sanitizer_syscall_post_paccept(res, s, name, anamelen, mask, flags)  \
+  __sanitizer_syscall_post_impl_paccept(                                       \
+      res, (long long)(s), (long long)(name), (long long)(anamelen),           \
+      (long long)(mask), (long long)(flags))
+#define __sanitizer_syscall_pre_linkat(fd1, name1, fd2, name2, flags)          \
+  __sanitizer_syscall_pre_impl_linkat((long long)(fd1), (long long)(name1),    \
+                                      (long long)(fd2), (long long)(name2),    \
+                                      (long long)(flags))
+#define __sanitizer_syscall_post_linkat(res, fd1, name1, fd2, name2, flags)    \
+  __sanitizer_syscall_post_impl_linkat(res, (long long)(fd1),                  \
+                                       (long long)(name1), (long long)(fd2),   \
+                                       (long long)(name2), (long long)(flags))
+#define __sanitizer_syscall_pre_renameat(fromfd, from, tofd, to)               \
+  __sanitizer_syscall_pre_impl_renameat((long long)(fromfd),                   \
+                                        (long long)(from), (long long)(tofd),  \
+                                        (long long)(to))
+#define __sanitizer_syscall_post_renameat(res, fromfd, from, tofd, to)         \
+  __sanitizer_syscall_post_impl_renameat(res, (long long)(fromfd),             \
+                                         (long long)(from), (long long)(tofd), \
+                                         (long long)(to))
+#define __sanitizer_syscall_pre_mkfifoat(fd, path, mode)                       \
+  __sanitizer_syscall_pre_impl_mkfifoat((long long)(fd), (long long)(path),    \
+                                        (long long)(mode))
+#define __sanitizer_syscall_post_mkfifoat(res, fd, path, mode)                 \
+  __sanitizer_syscall_post_impl_mkfifoat(res, (long long)(fd),                 \
+                                         (long long)(path), (long long)(mode))
+#define __sanitizer_syscall_pre_mknodat(fd, path, mode, PAD, dev)              \
+  __sanitizer_syscall_pre_impl_mknodat((long long)(fd), (long long)(path),     \
+                                       (long long)(mode), (long long)(PAD),    \
+                                       (long long)(dev))
+#define __sanitizer_syscall_post_mknodat(res, fd, path, mode, PAD, dev)        \
+  __sanitizer_syscall_post_impl_mknodat(res, (long long)(fd),                  \
+                                        (long long)(path), (long long)(mode),  \
+                                        (long long)(PAD), (long long)(dev))
+#define __sanitizer_syscall_pre_mkdirat(fd, path, mode)                        \
+  __sanitizer_syscall_pre_impl_mkdirat((long long)(fd), (long long)(path),     \
+                                       (long long)(mode))
+#define __sanitizer_syscall_post_mkdirat(res, fd, path, mode)                  \
+  __sanitizer_syscall_post_impl_mkdirat(res, (long long)(fd),                  \
+                                        (long long)(path), (long long)(mode))
+#define __sanitizer_syscall_pre_faccessat(fd, path, amode, flag)               \
+  __sanitizer_syscall_pre_impl_faccessat((long long)(fd), (long long)(path),   \
+                                         (long long)(amode),                   \
+                                         (long long)(flag))
+#define __sanitizer_syscall_post_faccessat(res, fd, path, amode, flag)         \
+  __sanitizer_syscall_post_impl_faccessat(                                     \
+      res, (long long)(fd), (long long)(path), (long long)(amode),             \
+      (long long)(flag))
+#define __sanitizer_syscall_pre_fchmodat(fd, path, mode, flag)                 \
+  __sanitizer_syscall_pre_impl_fchmodat((long long)(fd), (long long)(path),    \
+                                        (long long)(mode), (long long)(flag))
+#define __sanitizer_syscall_post_fchmodat(res, fd, path, mode, flag)           \
+  __sanitizer_syscall_post_impl_fchmodat(res, (long long)(fd),                 \
+                                         (long long)(path), (long long)(mode), \
+                                         (long long)(flag))
+#define __sanitizer_syscall_pre_fchownat(fd, path, owner, group, flag)         \
+  __sanitizer_syscall_pre_impl_fchownat((long long)(fd), (long long)(path),    \
+                                        (long long)(owner),                    \
+                                        (long long)(group), (long long)(flag))
+#define __sanitizer_syscall_post_fchownat(res, fd, path, owner, group, flag)   \
+  __sanitizer_syscall_post_impl_fchownat(                                      \
+      res, (long long)(fd), (long long)(path), (long long)(owner),             \
+      (long long)(group), (long long)(flag))
+#define __sanitizer_syscall_pre_fexecve(fd, argp, envp)                        \
+  __sanitizer_syscall_pre_impl_fexecve((long long)(fd), (long long)(argp),     \
+                                       (long long)(envp))
+#define __sanitizer_syscall_post_fexecve(res, fd, argp, envp)                  \
+  __sanitizer_syscall_post_impl_fexecve(res, (long long)(fd),                  \
+                                        (long long)(argp), (long long)(envp))
+#define __sanitizer_syscall_pre_fstatat(fd, path, buf, flag)                   \
+  __sanitizer_syscall_pre_impl_fstatat((long long)(fd), (long long)(path),     \
+                                       (long long)(buf), (long long)(flag))
+#define __sanitizer_syscall_post_fstatat(res, fd, path, buf, flag)             \
+  __sanitizer_syscall_post_impl_fstatat(res, (long long)(fd),                  \
+                                        (long long)(path), (long long)(buf),   \
+                                        (long long)(flag))
+#define __sanitizer_syscall_pre_utimensat(fd, path, tptr, flag)                \
+  __sanitizer_syscall_pre_impl_utimensat((long long)(fd), (long long)(path),   \
+                                         (long long)(tptr), (long long)(flag))
+#define __sanitizer_syscall_post_utimensat(res, fd, path, tptr, flag)          \
+  __sanitizer_syscall_post_impl_utimensat(                                     \
+      res, (long long)(fd), (long long)(path), (long long)(tptr),              \
+      (long long)(flag))
+#define __sanitizer_syscall_pre_openat(fd, path, oflags, mode)                 \
+  __sanitizer_syscall_pre_impl_openat((long long)(fd), (long long)(path),      \
+                                      (long long)(oflags), (long long)(mode))
+#define __sanitizer_syscall_post_openat(res, fd, path, oflags, mode)           \
+  __sanitizer_syscall_post_impl_openat(res, (long long)(fd),                   \
+                                       (long long)(path), (long long)(oflags), \
+                                       (long long)(mode))
+#define __sanitizer_syscall_pre_readlinkat(fd, path, buf, bufsize)             \
+  __sanitizer_syscall_pre_impl_readlinkat((long long)(fd), (long long)(path),  \
+                                          (long long)(buf),                    \
+                                          (long long)(bufsize))
+#define __sanitizer_syscall_post_readlinkat(res, fd, path, buf, bufsize)       \
+  __sanitizer_syscall_post_impl_readlinkat(                                    \
+      res, (long long)(fd), (long long)(path), (long long)(buf),               \
+      (long long)(bufsize))
+#define __sanitizer_syscall_pre_symlinkat(path1, fd, path2)                    \
+  __sanitizer_syscall_pre_impl_symlinkat((long long)(path1), (long long)(fd),  \
+                                         (long long)(path2))
+#define __sanitizer_syscall_post_symlinkat(res, path1, fd, path2)              \
+  __sanitizer_syscall_post_impl_symlinkat(res, (long long)(path1),             \
+                                          (long long)(fd), (long long)(path2))
+#define __sanitizer_syscall_pre_unlinkat(fd, path, flag)                       \
+  __sanitizer_syscall_pre_impl_unlinkat((long long)(fd), (long long)(path),    \
+                                        (long long)(flag))
+#define __sanitizer_syscall_post_unlinkat(res, fd, path, flag)                 \
+  __sanitizer_syscall_post_impl_unlinkat(res, (long long)(fd),                 \
+                                         (long long)(path), (long long)(flag))
+#define __sanitizer_syscall_pre_futimens(fd, tptr)                             \
+  __sanitizer_syscall_pre_impl_futimens((long long)(fd), (long long)(tptr))
+#define __sanitizer_syscall_post_futimens(res, fd, tptr)                       \
+  __sanitizer_syscall_post_impl_futimens(res, (long long)(fd),                 \
+                                         (long long)(tptr))
+#define __sanitizer_syscall_pre___quotactl(path, args)                         \
+  __sanitizer_syscall_pre_impl___quotactl((long long)(path), (long long)(args))
+#define __sanitizer_syscall_post___quotactl(res, path, args)                   \
+  __sanitizer_syscall_post_impl___quotactl(res, (long long)(path),             \
+                                           (long long)(args))
+#define __sanitizer_syscall_pre_posix_spawn(pid, path, file_actions, attrp,    \
+                                            argv, envp)                        \
+  __sanitizer_syscall_pre_impl_posix_spawn(                                    \
+      (long long)(pid), (long long)(path), (long long)(file_actions),          \
+      (long long)(attrp), (long long)(argv), (long long)(envp))
+#define __sanitizer_syscall_post_posix_spawn(res, pid, path, file_actions,     \
+                                             attrp, argv, envp)                \
+  __sanitizer_syscall_post_impl_posix_spawn(                                   \
+      res, (long long)(pid), (long long)(path), (long long)(file_actions),     \
+      (long long)(attrp), (long long)(argv), (long long)(envp))
+#define __sanitizer_syscall_pre_recvmmsg(s, mmsg, vlen, flags, timeout)        \
+  __sanitizer_syscall_pre_impl_recvmmsg((long long)(s), (long long)(mmsg),     \
+                                        (long long)(vlen), (long long)(flags), \
+                                        (long long)(timeout))
+#define __sanitizer_syscall_post_recvmmsg(res, s, mmsg, vlen, flags, timeout)  \
+  __sanitizer_syscall_post_impl_recvmmsg(                                      \
+      res, (long long)(s), (long long)(mmsg), (long long)(vlen),               \
+      (long long)(flags), (long long)(timeout))
+#define __sanitizer_syscall_pre_sendmmsg(s, mmsg, vlen, flags)                 \
+  __sanitizer_syscall_pre_impl_sendmmsg((long long)(s), (long long)(mmsg),     \
+                                        (long long)(vlen), (long long)(flags))
+#define __sanitizer_syscall_post_sendmmsg(res, s, mmsg, vlen, flags)           \
+  __sanitizer_syscall_post_impl_sendmmsg(res, (long long)(s),                  \
+                                         (long long)(mmsg), (long long)(vlen), \
+                                         (long long)(flags))
+#define __sanitizer_syscall_pre_clock_nanosleep(clock_id, flags, rqtp, rmtp)   \
+  __sanitizer_syscall_pre_impl_clock_nanosleep(                                \
+      (long long)(clock_id), (long long)(flags), (long long)(rqtp),            \
+      (long long)(rmtp))
+#define __sanitizer_syscall_post_clock_nanosleep(res, clock_id, flags, rqtp,   \
+                                                 rmtp)                         \
+  __sanitizer_syscall_post_impl_clock_nanosleep(                               \
+      res, (long long)(clock_id), (long long)(flags), (long long)(rqtp),       \
+      (long long)(rmtp))
+#define __sanitizer_syscall_pre____lwp_park60(clock_id, flags, ts, unpark,     \
+                                              hint, unparkhint)                \
+  __sanitizer_syscall_pre_impl____lwp_park60(                                  \
+      (long long)(clock_id), (long long)(flags), (long long)(ts),              \
+      (long long)(unpark), (long long)(hint), (long long)(unparkhint))
+#define __sanitizer_syscall_post____lwp_park60(res, clock_id, flags, ts,       \
+                                               unpark, hint, unparkhint)       \
+  __sanitizer_syscall_post_impl____lwp_park60(                                 \
+      res, (long long)(clock_id), (long long)(flags), (long long)(ts),         \
+      (long long)(unpark), (long long)(hint), (long long)(unparkhint))
+#define __sanitizer_syscall_pre_posix_fallocate(fd, PAD, pos, len)             \
+  __sanitizer_syscall_pre_impl_posix_fallocate(                                \
+      (long long)(fd), (long long)(PAD), (long long)(pos), (long long)(len))
+#define __sanitizer_syscall_post_posix_fallocate(res, fd, PAD, pos, len)       \
+  __sanitizer_syscall_post_impl_posix_fallocate(                               \
+      res, (long long)(fd), (long long)(PAD), (long long)(pos),                \
+      (long long)(len))
+#define __sanitizer_syscall_pre_fdiscard(fd, PAD, pos, len)                    \
+  __sanitizer_syscall_pre_impl_fdiscard((long long)(fd), (long long)(PAD),     \
+                                        (long long)(pos), (long long)(len))
+#define __sanitizer_syscall_post_fdiscard(res, fd, PAD, pos, len)              \
+  __sanitizer_syscall_post_impl_fdiscard(res, (long long)(fd),                 \
+                                         (long long)(PAD), (long long)(pos),   \
+                                         (long long)(len))
+#define __sanitizer_syscall_pre_wait6(idtype, id, status, options, wru, info)  \
+  __sanitizer_syscall_pre_impl_wait6(                                          \
+      (long long)(idtype), (long long)(id), (long long)(status),               \
+      (long long)(options), (long long)(wru), (long long)(info))
+#define __sanitizer_syscall_post_wait6(res, idtype, id, status, options, wru,  \
+                                       info)                                   \
+  __sanitizer_syscall_post_impl_wait6(                                         \
+      res, (long long)(idtype), (long long)(id), (long long)(status),          \
+      (long long)(options), (long long)(wru), (long long)(info))
+#define __sanitizer_syscall_pre_clock_getcpuclockid2(idtype, id, clock_id)     \
+  __sanitizer_syscall_pre_impl_clock_getcpuclockid2(                           \
+      (long long)(idtype), (long long)(id), (long long)(clock_id))
+#define __sanitizer_syscall_post_clock_getcpuclockid2(res, idtype, id,         \
+                                                      clock_id)                \
+  __sanitizer_syscall_post_impl_clock_getcpuclockid2(                          \
+      res, (long long)(idtype), (long long)(id), (long long)(clock_id))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Private declarations. Do not call directly from user code. Use macros above.
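+//
+// A minimal usage sketch (assuming the openat hooks defined above): a
+// wrapper invokes the pre_* macro just before issuing the syscall and the
+// matching post_* macro with the raw return value afterwards, e.g.
+//
+//   __sanitizer_syscall_pre_openat(fd, path, oflags, mode);
+//   long long res = openat(fd, path, oflags, mode);
+//   __sanitizer_syscall_post_openat(res, fd, path, oflags, mode);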
+
+// DO NOT EDIT! THIS FILE HAS BEEN GENERATED!
+
+void __sanitizer_syscall_pre_impl_syscall(long long code, long long arg0,
+                                          long long arg1, long long arg2,
+                                          long long arg3, long long arg4,
+                                          long long arg5, long long arg6,
+                                          long long arg7);
+void __sanitizer_syscall_post_impl_syscall(long long res, long long code,
+                                           long long arg0, long long arg1,
+                                           long long arg2, long long arg3,
+                                           long long arg4, long long arg5,
+                                           long long arg6, long long arg7);
+void __sanitizer_syscall_pre_impl_exit(long long rval);
+void __sanitizer_syscall_post_impl_exit(long long res, long long rval);
+void __sanitizer_syscall_pre_impl_fork(void);
+void __sanitizer_syscall_post_impl_fork(long long res);
+void __sanitizer_syscall_pre_impl_read(long long fd, long long buf,
+                                       long long nbyte);
+void __sanitizer_syscall_post_impl_read(long long res, long long fd,
+                                        long long buf, long long nbyte);
+void __sanitizer_syscall_pre_impl_write(long long fd, long long buf,
+                                        long long nbyte);
+void __sanitizer_syscall_post_impl_write(long long res, long long fd,
+                                         long long buf, long long nbyte);
+void __sanitizer_syscall_pre_impl_open(long long path, long long flags,
+                                       long long mode);
+void __sanitizer_syscall_post_impl_open(long long res, long long path,
+                                        long long flags, long long mode);
+void __sanitizer_syscall_pre_impl_close(long long fd);
+void __sanitizer_syscall_post_impl_close(long long res, long long fd);
+void __sanitizer_syscall_pre_impl_compat_50_wait4(long long pid,
+                                                  long long status,
+                                                  long long options,
+                                                  long long rusage);
+void __sanitizer_syscall_post_impl_compat_50_wait4(long long res, long long pid,
+                                                   long long status,
+                                                   long long options,
+                                                   long long rusage);
+void __sanitizer_syscall_pre_impl_compat_43_ocreat(long long path,
+                                                   long long mode);
+void __sanitizer_syscall_post_impl_compat_43_ocreat(long long res,
+                                                    long long path,
+                                                    long long mode);
+void __sanitizer_syscall_pre_impl_link(long long path, long long link);
+void __sanitizer_syscall_post_impl_link(long long res, long long path,
+                                        long long link);
+void __sanitizer_syscall_pre_impl_unlink(long long path);
+void __sanitizer_syscall_post_impl_unlink(long long res, long long path);
+/* syscall 11 has been skipped */
+void __sanitizer_syscall_pre_impl_chdir(long long path);
+void __sanitizer_syscall_post_impl_chdir(long long res, long long path);
+void __sanitizer_syscall_pre_impl_fchdir(long long fd);
+void __sanitizer_syscall_post_impl_fchdir(long long res, long long fd);
+void __sanitizer_syscall_pre_impl_compat_50_mknod(long long path,
+                                                  long long mode,
+                                                  long long dev);
+void __sanitizer_syscall_post_impl_compat_50_mknod(long long res,
+                                                   long long path,
+                                                   long long mode,
+                                                   long long dev);
+void __sanitizer_syscall_pre_impl_chmod(long long path, long long mode);
+void __sanitizer_syscall_post_impl_chmod(long long res, long long path,
+                                         long long mode);
+void __sanitizer_syscall_pre_impl_chown(long long path, long long uid,
+                                        long long gid);
+void __sanitizer_syscall_post_impl_chown(long long res, long long path,
+                                         long long uid, long long gid);
+void __sanitizer_syscall_pre_impl_break(long long nsize);
+void __sanitizer_syscall_post_impl_break(long long res, long long nsize);
+void __sanitizer_syscall_pre_impl_compat_20_getfsstat(long long buf,
+                                                      long long bufsize,
+                                                      long long flags);
+void __sanitizer_syscall_post_impl_compat_20_getfsstat(long long res,
+                                                       long long buf,
+                                                       long long bufsize,
+                                                       long long flags);
+void __sanitizer_syscall_pre_impl_compat_43_olseek(long long fd,
+                                                   long long offset,
+                                                   long long whence);
+void __sanitizer_syscall_post_impl_compat_43_olseek(long long res, long long fd,
+                                                    long long offset,
+                                                    long long whence);
+void __sanitizer_syscall_pre_impl_getpid(void);
+void __sanitizer_syscall_post_impl_getpid(long long res);
+void __sanitizer_syscall_pre_impl_compat_40_mount(long long type,
+                                                  long long path,
+                                                  long long flags,
+                                                  long long data);
+void __sanitizer_syscall_post_impl_compat_40_mount(long long res,
+                                                   long long type,
+                                                   long long path,
+                                                   long long flags,
+                                                   long long data);
+void __sanitizer_syscall_pre_impl_unmount(long long path, long long flags);
+void __sanitizer_syscall_post_impl_unmount(long long res, long long path,
+                                           long long flags);
+void __sanitizer_syscall_pre_impl_setuid(long long uid);
+void __sanitizer_syscall_post_impl_setuid(long long res, long long uid);
+void __sanitizer_syscall_pre_impl_getuid(void);
+void __sanitizer_syscall_post_impl_getuid(long long res);
+void __sanitizer_syscall_pre_impl_geteuid(void);
+void __sanitizer_syscall_post_impl_geteuid(long long res);
+void __sanitizer_syscall_pre_impl_ptrace(long long req, long long pid,
+                                         long long addr, long long data);
+void __sanitizer_syscall_post_impl_ptrace(long long res, long long req,
+                                          long long pid, long long addr,
+                                          long long data);
+void __sanitizer_syscall_pre_impl_recvmsg(long long s, long long msg,
+                                          long long flags);
+void __sanitizer_syscall_post_impl_recvmsg(long long res, long long s,
+                                           long long msg, long long flags);
+void __sanitizer_syscall_pre_impl_sendmsg(long long s, long long msg,
+                                          long long flags);
+void __sanitizer_syscall_post_impl_sendmsg(long long res, long long s,
+                                           long long msg, long long flags);
+void __sanitizer_syscall_pre_impl_recvfrom(long long s, long long buf,
+                                           long long len, long long flags,
+                                           long long from,
+                                           long long fromlenaddr);
+void __sanitizer_syscall_post_impl_recvfrom(long long res, long long s,
+                                            long long buf, long long len,
+                                            long long flags, long long from,
+                                            long long fromlenaddr);
+void __sanitizer_syscall_pre_impl_accept(long long s, long long name,
+                                         long long anamelen);
+void __sanitizer_syscall_post_impl_accept(long long res, long long s,
+                                          long long name, long long anamelen);
+void __sanitizer_syscall_pre_impl_getpeername(long long fdes, long long asa,
+                                              long long alen);
+void __sanitizer_syscall_post_impl_getpeername(long long res, long long fdes,
+                                               long long asa, long long alen);
+void __sanitizer_syscall_pre_impl_getsockname(long long fdes, long long asa,
+                                              long long alen);
+void __sanitizer_syscall_post_impl_getsockname(long long res, long long fdes,
+                                               long long asa, long long alen);
+void __sanitizer_syscall_pre_impl_access(long long path, long long flags);
+void __sanitizer_syscall_post_impl_access(long long res, long long path,
+                                          long long flags);
+void __sanitizer_syscall_pre_impl_chflags(long long path, long long flags);
+void __sanitizer_syscall_post_impl_chflags(long long res, long long path,
+                                           long long flags);
+void __sanitizer_syscall_pre_impl_fchflags(long long fd, long long flags);
+void __sanitizer_syscall_post_impl_fchflags(long long res, long long fd,
+                                            long long flags);
+void __sanitizer_syscall_pre_impl_sync(void);
+void __sanitizer_syscall_post_impl_sync(long long res);
+void __sanitizer_syscall_pre_impl_kill(long long pid, long long signum);
+void __sanitizer_syscall_post_impl_kill(long long res, long long pid,
+                                        long long signum);
+void __sanitizer_syscall_pre_impl_compat_43_stat43(long long path,
+                                                   long long ub);
+void __sanitizer_syscall_post_impl_compat_43_stat43(long long res,
+                                                    long long path,
+                                                    long long ub);
+void __sanitizer_syscall_pre_impl_getppid(void);
+void __sanitizer_syscall_post_impl_getppid(long long res);
+void __sanitizer_syscall_pre_impl_compat_43_lstat43(long long path,
+                                                    long long ub);
+void __sanitizer_syscall_post_impl_compat_43_lstat43(long long res,
+                                                     long long path,
+                                                     long long ub);
+void __sanitizer_syscall_pre_impl_dup(long long fd);
+void __sanitizer_syscall_post_impl_dup(long long res, long long fd);
+void __sanitizer_syscall_pre_impl_pipe(void);
+void __sanitizer_syscall_post_impl_pipe(long long res);
+void __sanitizer_syscall_pre_impl_getegid(void);
+void __sanitizer_syscall_post_impl_getegid(long long res);
+void __sanitizer_syscall_pre_impl_profil(long long samples, long long size,
+                                         long long offset, long long scale);
+void __sanitizer_syscall_post_impl_profil(long long res, long long samples,
+                                          long long size, long long offset,
+                                          long long scale);
+void __sanitizer_syscall_pre_impl_ktrace(long long fname, long long ops,
+                                         long long facs, long long pid);
+void __sanitizer_syscall_post_impl_ktrace(long long res, long long fname,
+                                          long long ops, long long facs,
+                                          long long pid);
+void __sanitizer_syscall_pre_impl_compat_13_sigaction13(long long signum,
+                                                        long long nsa,
+                                                        long long osa);
+void __sanitizer_syscall_post_impl_compat_13_sigaction13(long long res,
+                                                         long long signum,
+                                                         long long nsa,
+                                                         long long osa);
+void __sanitizer_syscall_pre_impl_getgid(void);
+void __sanitizer_syscall_post_impl_getgid(long long res);
+void __sanitizer_syscall_pre_impl_compat_13_sigprocmask13(long long how,
+                                                          long long mask);
+void __sanitizer_syscall_post_impl_compat_13_sigprocmask13(long long res,
+                                                           long long how,
+                                                           long long mask);
+void __sanitizer_syscall_pre_impl___getlogin(long long namebuf,
+                                             long long namelen);
+void __sanitizer_syscall_post_impl___getlogin(long long res, long long namebuf,
+                                              long long namelen);
+void __sanitizer_syscall_pre_impl___setlogin(long long namebuf);
+void __sanitizer_syscall_post_impl___setlogin(long long res, long long namebuf);
+void __sanitizer_syscall_pre_impl_acct(long long path);
+void __sanitizer_syscall_post_impl_acct(long long res, long long path);
+void __sanitizer_syscall_pre_impl_compat_13_sigpending13(void);
+void __sanitizer_syscall_post_impl_compat_13_sigpending13(long long res);
+void __sanitizer_syscall_pre_impl_compat_13_sigaltstack13(long long nss,
+                                                          long long oss);
+void __sanitizer_syscall_post_impl_compat_13_sigaltstack13(long long res,
+                                                           long long nss,
+                                                           long long oss);
+void __sanitizer_syscall_pre_impl_ioctl(long long fd, long long com,
+                                        long long data);
+void __sanitizer_syscall_post_impl_ioctl(long long res, long long fd,
+                                         long long com, long long data);
+void __sanitizer_syscall_pre_impl_compat_12_oreboot(long long opt);
+void __sanitizer_syscall_post_impl_compat_12_oreboot(long long res,
+                                                     long long opt);
+void __sanitizer_syscall_pre_impl_revoke(long long path);
+void __sanitizer_syscall_post_impl_revoke(long long res, long long path);
+void __sanitizer_syscall_pre_impl_symlink(long long path, long long link);
+void __sanitizer_syscall_post_impl_symlink(long long res, long long path,
+                                           long long link);
+void __sanitizer_syscall_pre_impl_readlink(long long path, long long buf,
+                                           long long count);
+void __sanitizer_syscall_post_impl_readlink(long long res, long long path,
+                                            long long buf, long long count);
+void __sanitizer_syscall_pre_impl_execve(long long path, long long argp,
+                                         long long envp);
+void __sanitizer_syscall_post_impl_execve(long long res, long long path,
+                                          long long argp, long long envp);
+void __sanitizer_syscall_pre_impl_umask(long long newmask);
+void __sanitizer_syscall_post_impl_umask(long long res, long long newmask);
+void __sanitizer_syscall_pre_impl_chroot(long long path);
+void __sanitizer_syscall_post_impl_chroot(long long res, long long path);
+void __sanitizer_syscall_pre_impl_compat_43_fstat43(long long fd, long long sb);
+void __sanitizer_syscall_post_impl_compat_43_fstat43(long long res,
+                                                     long long fd,
+                                                     long long sb);
+void __sanitizer_syscall_pre_impl_compat_43_ogetkerninfo(long long op,
+                                                         long long where,
+                                                         long long size,
+                                                         long long arg);
+void __sanitizer_syscall_post_impl_compat_43_ogetkerninfo(long long res,
+                                                          long long op,
+                                                          long long where,
+                                                          long long size,
+                                                          long long arg);
+void __sanitizer_syscall_pre_impl_compat_43_ogetpagesize(void);
+void __sanitizer_syscall_post_impl_compat_43_ogetpagesize(long long res);
+void __sanitizer_syscall_pre_impl_compat_12_msync(long long addr,
+                                                  long long len);
+void __sanitizer_syscall_post_impl_compat_12_msync(long long res,
+                                                   long long addr,
+                                                   long long len);
+void __sanitizer_syscall_pre_impl_vfork(void);
+void __sanitizer_syscall_post_impl_vfork(long long res);
+/* syscall 67 has been skipped */
+/* syscall 68 has been skipped */
+/* syscall 69 has been skipped */
+/* syscall 70 has been skipped */
+void __sanitizer_syscall_pre_impl_compat_43_ommap(long long addr, long long len,
+                                                  long long prot,
+                                                  long long flags, long long fd,
+                                                  long long pos);
+void __sanitizer_syscall_post_impl_compat_43_ommap(
+    long long res, long long addr, long long len, long long prot,
+    long long flags, long long fd, long long pos);
+void __sanitizer_syscall_pre_impl_vadvise(long long anom);
+void __sanitizer_syscall_post_impl_vadvise(long long res, long long anom);
+void __sanitizer_syscall_pre_impl_munmap(long long addr, long long len);
+void __sanitizer_syscall_post_impl_munmap(long long res, long long addr,
+                                          long long len);
+void __sanitizer_syscall_pre_impl_mprotect(long long addr, long long len,
+                                           long long prot);
+void __sanitizer_syscall_post_impl_mprotect(long long res, long long addr,
+                                            long long len, long long prot);
+void __sanitizer_syscall_pre_impl_madvise(long long addr, long long len,
+                                          long long behav);
+void __sanitizer_syscall_post_impl_madvise(long long res, long long addr,
+                                           long long len, long long behav);
+/* syscall 76 has been skipped */
+/* syscall 77 has been skipped */
+void __sanitizer_syscall_pre_impl_mincore(long long addr, long long len,
+                                          long long vec);
+void __sanitizer_syscall_post_impl_mincore(long long res, long long addr,
+                                           long long len, long long vec);
+void __sanitizer_syscall_pre_impl_getgroups(long long gidsetsize,
+                                            long long gidset);
+void __sanitizer_syscall_post_impl_getgroups(long long res,
+                                             long long gidsetsize,
+                                             long long gidset);
+void __sanitizer_syscall_pre_impl_setgroups(long long gidsetsize,
+                                            long long gidset);
+void __sanitizer_syscall_post_impl_setgroups(long long res,
+                                             long long gidsetsize,
+                                             long long gidset);
+void __sanitizer_syscall_pre_impl_getpgrp(void);
+void __sanitizer_syscall_post_impl_getpgrp(long long res);
+void __sanitizer_syscall_pre_impl_setpgid(long long pid, long long pgid);
+void __sanitizer_syscall_post_impl_setpgid(long long res, long long pid,
+                                           long long pgid);
+void __sanitizer_syscall_pre_impl_compat_50_setitimer(long long which,
+                                                      long long itv,
+                                                      long long oitv);
+void __sanitizer_syscall_post_impl_compat_50_setitimer(long long res,
+                                                       long long which,
+                                                       long long itv,
+                                                       long long oitv);
+void __sanitizer_syscall_pre_impl_compat_43_owait(void);
+void __sanitizer_syscall_post_impl_compat_43_owait(long long res);
+void __sanitizer_syscall_pre_impl_compat_12_oswapon(long long name);
+void __sanitizer_syscall_post_impl_compat_12_oswapon(long long res,
+                                                     long long name);
+void __sanitizer_syscall_pre_impl_compat_50_getitimer(long long which,
+                                                      long long itv);
+void __sanitizer_syscall_post_impl_compat_50_getitimer(long long res,
+                                                       long long which,
+                                                       long long itv);
+void __sanitizer_syscall_pre_impl_compat_43_ogethostname(long long hostname,
+                                                         long long len);
+void __sanitizer_syscall_post_impl_compat_43_ogethostname(long long res,
+                                                          long long hostname,
+                                                          long long len);
+void __sanitizer_syscall_pre_impl_compat_43_osethostname(long long hostname,
+                                                         long long len);
+void __sanitizer_syscall_post_impl_compat_43_osethostname(long long res,
+                                                          long long hostname,
+                                                          long long len);
+void __sanitizer_syscall_pre_impl_compat_43_ogetdtablesize(void);
+void __sanitizer_syscall_post_impl_compat_43_ogetdtablesize(long long res);
+void __sanitizer_syscall_pre_impl_dup2(long long from, long long to);
+void __sanitizer_syscall_post_impl_dup2(long long res, long long from,
+                                        long long to);
+/* syscall 91 has been skipped */
+void __sanitizer_syscall_pre_impl_fcntl(long long fd, long long cmd,
+                                        long long arg);
+void __sanitizer_syscall_post_impl_fcntl(long long res, long long fd,
+                                         long long cmd, long long arg);
+void __sanitizer_syscall_pre_impl_compat_50_select(long long nd, long long in,
+                                                   long long ou, long long ex,
+                                                   long long tv);
+void __sanitizer_syscall_post_impl_compat_50_select(long long res, long long nd,
+                                                    long long in, long long ou,
+                                                    long long ex, long long tv);
+/* syscall 94 has been skipped */
+void __sanitizer_syscall_pre_impl_fsync(long long fd);
+void __sanitizer_syscall_post_impl_fsync(long long res, long long fd);
+void __sanitizer_syscall_pre_impl_setpriority(long long which, long long who,
+                                              long long prio);
+void __sanitizer_syscall_post_impl_setpriority(long long res, long long which,
+                                               long long who, long long prio);
+void __sanitizer_syscall_pre_impl_compat_30_socket(long long domain,
+                                                   long long type,
+                                                   long long protocol);
+void __sanitizer_syscall_post_impl_compat_30_socket(long long res,
+                                                    long long domain,
+                                                    long long type,
+                                                    long long protocol);
+void __sanitizer_syscall_pre_impl_connect(long long s, long long name,
+                                          long long namelen);
+void __sanitizer_syscall_post_impl_connect(long long res, long long s,
+                                           long long name, long long namelen);
+void __sanitizer_syscall_pre_impl_compat_43_oaccept(long long s, long long name,
+                                                    long long anamelen);
+void __sanitizer_syscall_post_impl_compat_43_oaccept(long long res, long long s,
+                                                     long long name,
+                                                     long long anamelen);
+void __sanitizer_syscall_pre_impl_getpriority(long long which, long long who);
+void __sanitizer_syscall_post_impl_getpriority(long long res, long long which,
+                                               long long who);
+void __sanitizer_syscall_pre_impl_compat_43_osend(long long s, long long buf,
+                                                  long long len,
+                                                  long long flags);
+void __sanitizer_syscall_post_impl_compat_43_osend(long long res, long long s,
+                                                   long long buf, long long len,
+                                                   long long flags);
+void __sanitizer_syscall_pre_impl_compat_43_orecv(long long s, long long buf,
+                                                  long long len,
+                                                  long long flags);
+void __sanitizer_syscall_post_impl_compat_43_orecv(long long res, long long s,
+                                                   long long buf, long long len,
+                                                   long long flags);
+void __sanitizer_syscall_pre_impl_compat_13_sigreturn13(long long sigcntxp);
+void __sanitizer_syscall_post_impl_compat_13_sigreturn13(long long res,
+                                                         long long sigcntxp);
+void __sanitizer_syscall_pre_impl_bind(long long s, long long name,
+                                       long long namelen);
+void __sanitizer_syscall_post_impl_bind(long long res, long long s,
+                                        long long name, long long namelen);
+void __sanitizer_syscall_pre_impl_setsockopt(long long s, long long level,
+                                             long long name, long long val,
+                                             long long valsize);
+void __sanitizer_syscall_post_impl_setsockopt(long long res, long long s,
+                                              long long level, long long name,
+                                              long long val, long long valsize);
+void __sanitizer_syscall_pre_impl_listen(long long s, long long backlog);
+void __sanitizer_syscall_post_impl_listen(long long res, long long s,
+                                          long long backlog);
+/* syscall 107 has been skipped */
+void __sanitizer_syscall_pre_impl_compat_43_osigvec(long long signum,
+                                                    long long nsv,
+                                                    long long osv);
+void __sanitizer_syscall_post_impl_compat_43_osigvec(long long res,
+                                                     long long signum,
+                                                     long long nsv,
+                                                     long long osv);
+void __sanitizer_syscall_pre_impl_compat_43_osigblock(long long mask);
+void __sanitizer_syscall_post_impl_compat_43_osigblock(long long res,
+                                                       long long mask);
+void __sanitizer_syscall_pre_impl_compat_43_osigsetmask(long long mask);
+void __sanitizer_syscall_post_impl_compat_43_osigsetmask(long long res,
+                                                         long long mask);
+void __sanitizer_syscall_pre_impl_compat_13_sigsuspend13(long long mask);
+void __sanitizer_syscall_post_impl_compat_13_sigsuspend13(long long res,
+                                                          long long mask);
+void __sanitizer_syscall_pre_impl_compat_43_osigstack(long long nss,
+                                                      long long oss);
+void __sanitizer_syscall_post_impl_compat_43_osigstack(long long res,
+                                                       long long nss,
+                                                       long long oss);
+void __sanitizer_syscall_pre_impl_compat_43_orecvmsg(long long s, long long msg,
+                                                     long long flags);
+void __sanitizer_syscall_post_impl_compat_43_orecvmsg(long long res,
+                                                      long long s,
+                                                      long long msg,
+                                                      long long flags);
+void __sanitizer_syscall_pre_impl_compat_43_osendmsg(long long s, long long msg,
+                                                     long long flags);
+void __sanitizer_syscall_post_impl_compat_43_osendmsg(long long res,
+                                                      long long s,
+                                                      long long msg,
+                                                      long long flags);
+/* syscall 115 has been skipped */
+void __sanitizer_syscall_pre_impl_compat_50_gettimeofday(long long tp,
+                                                         long long tzp);
+void __sanitizer_syscall_post_impl_compat_50_gettimeofday(long long res,
+                                                          long long tp,
+                                                          long long tzp);
+void __sanitizer_syscall_pre_impl_compat_50_getrusage(long long who,
+                                                      long long rusage);
+void __sanitizer_syscall_post_impl_compat_50_getrusage(long long res,
+                                                       long long who,
+                                                       long long rusage);
+void __sanitizer_syscall_pre_impl_getsockopt(long long s, long long level,
+                                             long long name, long long val,
+                                             long long avalsize);
+void __sanitizer_syscall_post_impl_getsockopt(long long res, long long s,
+                                              long long level, long long name,
+                                              long long val,
+                                              long long avalsize);
+/* syscall 119 has been skipped */
+void __sanitizer_syscall_pre_impl_readv(long long fd, long long iovp,
+                                        long long iovcnt);
+void __sanitizer_syscall_post_impl_readv(long long res, long long fd,
+                                         long long iovp, long long iovcnt);
+void __sanitizer_syscall_pre_impl_writev(long long fd, long long iovp,
+                                         long long iovcnt);
+void __sanitizer_syscall_post_impl_writev(long long res, long long fd,
+                                          long long iovp, long long iovcnt);
+void __sanitizer_syscall_pre_impl_compat_50_settimeofday(long long tv,
+                                                         long long tzp);
+void __sanitizer_syscall_post_impl_compat_50_settimeofday(long long res,
+                                                          long long tv,
+                                                          long long tzp);
+void __sanitizer_syscall_pre_impl_fchown(long long fd, long long uid,
+                                         long long gid);
+void __sanitizer_syscall_post_impl_fchown(long long res, long long fd,
+                                          long long uid, long long gid);
+void __sanitizer_syscall_pre_impl_fchmod(long long fd, long long mode);
+void __sanitizer_syscall_post_impl_fchmod(long long res, long long fd,
+                                          long long mode);
+void __sanitizer_syscall_pre_impl_compat_43_orecvfrom(
+    long long s, long long buf, long long len, long long flags, long long from,
+    long long fromlenaddr);
+void __sanitizer_syscall_post_impl_compat_43_orecvfrom(
+    long long res, long long s, long long buf, long long len, long long flags,
+    long long from, long long fromlenaddr);
+void __sanitizer_syscall_pre_impl_setreuid(long long ruid, long long euid);
+void __sanitizer_syscall_post_impl_setreuid(long long res, long long ruid,
+                                            long long euid);
+void __sanitizer_syscall_pre_impl_setregid(long long rgid, long long egid);
+void __sanitizer_syscall_post_impl_setregid(long long res, long long rgid,
+                                            long long egid);
+void __sanitizer_syscall_pre_impl_rename(long long from, long long to);
+void __sanitizer_syscall_post_impl_rename(long long res, long long from,
+                                          long long to);
+void __sanitizer_syscall_pre_impl_compat_43_otruncate(long long path,
+                                                      long long length);
+void __sanitizer_syscall_post_impl_compat_43_otruncate(long long res,
+                                                       long long path,
+                                                       long long length);
+void __sanitizer_syscall_pre_impl_compat_43_oftruncate(long long fd,
+                                                       long long length);
+void __sanitizer_syscall_post_impl_compat_43_oftruncate(long long res,
+                                                        long long fd,
+                                                        long long length);
+void __sanitizer_syscall_pre_impl_flock(long long fd, long long how);
+void __sanitizer_syscall_post_impl_flock(long long res, long long fd,
+                                         long long how);
+void __sanitizer_syscall_pre_impl_mkfifo(long long path, long long mode);
+void __sanitizer_syscall_post_impl_mkfifo(long long res, long long path,
+                                          long long mode);
+void __sanitizer_syscall_pre_impl_sendto(long long s, long long buf,
+                                         long long len, long long flags,
+                                         long long to, long long tolen);
+void __sanitizer_syscall_post_impl_sendto(long long res, long long s,
+                                          long long buf, long long len,
+                                          long long flags, long long to,
+                                          long long tolen);
+void __sanitizer_syscall_pre_impl_shutdown(long long s, long long how);
+void __sanitizer_syscall_post_impl_shutdown(long long res, long long s,
+                                            long long how);
+void __sanitizer_syscall_pre_impl_socketpair(long long domain, long long type,
+                                             long long protocol, long long rsv);
+void __sanitizer_syscall_post_impl_socketpair(long long res, long long domain,
+                                              long long type,
+                                              long long protocol,
+                                              long long rsv);
+void __sanitizer_syscall_pre_impl_mkdir(long long path, long long mode);
+void __sanitizer_syscall_post_impl_mkdir(long long res, long long path,
+                                         long long mode);
+void __sanitizer_syscall_pre_impl_rmdir(long long path);
+void __sanitizer_syscall_post_impl_rmdir(long long res, long long path);
+void __sanitizer_syscall_pre_impl_compat_50_utimes(long long path,
+                                                   long long tptr);
+void __sanitizer_syscall_post_impl_compat_50_utimes(long long res,
+                                                    long long path,
+                                                    long long tptr);
+/* syscall 139 has been skipped */
+void __sanitizer_syscall_pre_impl_compat_50_adjtime(long long delta,
+                                                    long long olddelta);
+void __sanitizer_syscall_post_impl_compat_50_adjtime(long long res,
+                                                     long long delta,
+                                                     long long olddelta);
+void __sanitizer_syscall_pre_impl_compat_43_ogetpeername(long long fdes,
+                                                         long long asa,
+                                                         long long alen);
+void __sanitizer_syscall_post_impl_compat_43_ogetpeername(long long res,
+                                                          long long fdes,
+                                                          long long asa,
+                                                          long long alen);
+void __sanitizer_syscall_pre_impl_compat_43_ogethostid(void);
+void __sanitizer_syscall_post_impl_compat_43_ogethostid(long long res);
+void __sanitizer_syscall_pre_impl_compat_43_osethostid(long long hostid);
+void __sanitizer_syscall_post_impl_compat_43_osethostid(long long res,
+                                                        long long hostid);
+void __sanitizer_syscall_pre_impl_compat_43_ogetrlimit(long long which,
+                                                       long long rlp);
+void __sanitizer_syscall_post_impl_compat_43_ogetrlimit(long long res,
+                                                        long long which,
+                                                        long long rlp);
+void __sanitizer_syscall_pre_impl_compat_43_osetrlimit(long long which,
+                                                       long long rlp);
+void __sanitizer_syscall_post_impl_compat_43_osetrlimit(long long res,
+                                                        long long which,
+                                                        long long rlp);
+void __sanitizer_syscall_pre_impl_compat_43_okillpg(long long pgid,
+                                                    long long signum);
+void __sanitizer_syscall_post_impl_compat_43_okillpg(long long res,
+                                                     long long pgid,
+                                                     long long signum);
+void __sanitizer_syscall_pre_impl_setsid(void);
+void __sanitizer_syscall_post_impl_setsid(long long res);
+void __sanitizer_syscall_pre_impl_compat_50_quotactl(long long path,
+                                                     long long cmd,
+                                                     long long uid,
+                                                     long long arg);
+void __sanitizer_syscall_post_impl_compat_50_quotactl(
+    long long res, long long path, long long cmd, long long uid, long long arg);
+void __sanitizer_syscall_pre_impl_compat_43_oquota(void);
+void __sanitizer_syscall_post_impl_compat_43_oquota(long long res);
+void __sanitizer_syscall_pre_impl_compat_43_ogetsockname(long long fdec,
+                                                         long long asa,
+                                                         long long alen);
+void __sanitizer_syscall_post_impl_compat_43_ogetsockname(long long res,
+                                                          long long fdec,
+                                                          long long asa,
+                                                          long long alen);
+/* syscall 151 has been skipped */
+/* syscall 152 has been skipped */
+/* syscall 153 has been skipped */
+/* syscall 154 has been skipped */
+void __sanitizer_syscall_pre_impl_nfssvc(long long flag, long long argp);
+void __sanitizer_syscall_post_impl_nfssvc(long long res, long long flag,
+                                          long long argp);
+void __sanitizer_syscall_pre_impl_compat_43_ogetdirentries(long long fd,
+                                                           long long buf,
+                                                           long long count,
+                                                           long long basep);
+void __sanitizer_syscall_post_impl_compat_43_ogetdirentries(long long res,
+                                                            long long fd,
+                                                            long long buf,
+                                                            long long count,
+                                                            long long basep);
+void __sanitizer_syscall_pre_impl_compat_20_statfs(long long path,
+                                                   long long buf);
+void __sanitizer_syscall_post_impl_compat_20_statfs(long long res,
+                                                    long long path,
+                                                    long long buf);
+void __sanitizer_syscall_pre_impl_compat_20_fstatfs(long long fd,
+                                                    long long buf);
+void __sanitizer_syscall_post_impl_compat_20_fstatfs(long long res,
+                                                     long long fd,
+                                                     long long buf);
+/* syscall 159 has been skipped */
+/* syscall 160 has been skipped */
+void __sanitizer_syscall_pre_impl_compat_30_getfh(long long fname,
+                                                  long long fhp);
+void __sanitizer_syscall_post_impl_compat_30_getfh(long long res,
+                                                   long long fname,
+                                                   long long fhp);
+void __sanitizer_syscall_pre_impl_compat_09_ogetdomainname(long long domainname,
+                                                           long long len);
+void __sanitizer_syscall_post_impl_compat_09_ogetdomainname(
+    long long res, long long domainname, long long len);
+void __sanitizer_syscall_pre_impl_compat_09_osetdomainname(long long domainname,
+                                                           long long len);
+void __sanitizer_syscall_post_impl_compat_09_osetdomainname(
+    long long res, long long domainname, long long len);
+void __sanitizer_syscall_pre_impl_compat_09_ouname(long long name);
+void __sanitizer_syscall_post_impl_compat_09_ouname(long long res,
+                                                    long long name);
+void __sanitizer_syscall_pre_impl_sysarch(long long op, long long parms);
+void __sanitizer_syscall_post_impl_sysarch(long long res, long long op,
+                                           long long parms);
+/* syscall 166 has been skipped */
+/* syscall 167 has been skipped */
+/* syscall 168 has been skipped */
+#if !defined(_LP64)
+void __sanitizer_syscall_pre_impl_compat_10_osemsys(long long which,
+                                                    long long a2, long long a3,
+                                                    long long a4, long long a5);
+void __sanitizer_syscall_post_impl_compat_10_osemsys(long long res,
+                                                     long long which,
+                                                     long long a2, long long a3,
+                                                     long long a4,
+                                                     long long a5);
+#else
+/* syscall 169 has been skipped */
+#endif
+#if !defined(_LP64)
+void __sanitizer_syscall_pre_impl_compat_10_omsgsys(long long which,
+                                                    long long a2, long long a3,
+                                                    long long a4, long long a5,
+                                                    long long a6);
+void __sanitizer_syscall_post_impl_compat_10_omsgsys(long long res,
+                                                     long long which,
+                                                     long long a2, long long a3,
+                                                     long long a4, long long a5,
+                                                     long long a6);
+#else
+/* syscall 170 has been skipped */
+#endif
+#if !defined(_LP64)
+void __sanitizer_syscall_pre_impl_compat_10_oshmsys(long long which,
+                                                    long long a2, long long a3,
+                                                    long long a4);
+void __sanitizer_syscall_post_impl_compat_10_oshmsys(long long res,
+                                                     long long which,
+                                                     long long a2, long long a3,
+                                                     long long a4);
+#else
+/* syscall 171 has been skipped */
+#endif
+/* syscall 172 has been skipped */
+void __sanitizer_syscall_pre_impl_pread(long long fd, long long buf,
+                                        long long nbyte, long long PAD,
+                                        long long offset);
+void __sanitizer_syscall_post_impl_pread(long long res, long long fd,
+                                         long long buf, long long nbyte,
+                                         long long PAD, long long offset);
+void __sanitizer_syscall_pre_impl_pwrite(long long fd, long long buf,
+                                         long long nbyte, long long PAD,
+                                         long long offset);
+void __sanitizer_syscall_post_impl_pwrite(long long res, long long fd,
+                                          long long buf, long long nbyte,
+                                          long long PAD, long long offset);
+void __sanitizer_syscall_pre_impl_compat_30_ntp_gettime(long long ntvp);
+void __sanitizer_syscall_post_impl_compat_30_ntp_gettime(long long res,
+                                                         long long ntvp);
+#if defined(NTP) || !defined(_KERNEL_OPT)
+void __sanitizer_syscall_pre_impl_ntp_adjtime(long long tp);
+void __sanitizer_syscall_post_impl_ntp_adjtime(long long res, long long tp);
+#else
+/* syscall 176 has been skipped */
+#endif
+/* syscall 177 has been skipped */
+/* syscall 178 has been skipped */
+/* syscall 179 has been skipped */
+/* syscall 180 has been skipped */
+void __sanitizer_syscall_pre_impl_setgid(long long gid);
+void __sanitizer_syscall_post_impl_setgid(long long res, long long gid);
+void __sanitizer_syscall_pre_impl_setegid(long long egid);
+void __sanitizer_syscall_post_impl_setegid(long long res, long long egid);
+void __sanitizer_syscall_pre_impl_seteuid(long long euid);
+void __sanitizer_syscall_post_impl_seteuid(long long res, long long euid);
+void __sanitizer_syscall_pre_impl_lfs_bmapv(long long fsidp, long long blkiov,
+                                            long long blkcnt);
+void __sanitizer_syscall_post_impl_lfs_bmapv(long long res, long long fsidp,
+                                             long long blkiov,
+                                             long long blkcnt);
+void __sanitizer_syscall_pre_impl_lfs_markv(long long fsidp, long long blkiov,
+                                            long long blkcnt);
+void __sanitizer_syscall_post_impl_lfs_markv(long long res, long long fsidp,
+                                             long long blkiov,
+                                             long long blkcnt);
+void __sanitizer_syscall_pre_impl_lfs_segclean(long long fsidp,
+                                               long long segment);
+void __sanitizer_syscall_post_impl_lfs_segclean(long long res, long long fsidp,
+                                                long long segment);
+void __sanitizer_syscall_pre_impl_compat_50_lfs_segwait(long long fsidp,
+                                                        long long tv);
+void __sanitizer_syscall_post_impl_compat_50_lfs_segwait(long long res,
+                                                         long long fsidp,
+                                                         long long tv);
+void __sanitizer_syscall_pre_impl_compat_12_stat12(long long path,
+                                                   long long ub);
+void __sanitizer_syscall_post_impl_compat_12_stat12(long long res,
+                                                    long long path,
+                                                    long long ub);
+void __sanitizer_syscall_pre_impl_compat_12_fstat12(long long fd, long long sb);
+void __sanitizer_syscall_post_impl_compat_12_fstat12(long long res,
+                                                     long long fd,
+                                                     long long sb);
+void __sanitizer_syscall_pre_impl_compat_12_lstat12(long long path,
+                                                    long long ub);
+void __sanitizer_syscall_post_impl_compat_12_lstat12(long long res,
+                                                     long long path,
+                                                     long long ub);
+void __sanitizer_syscall_pre_impl_pathconf(long long path, long long name);
+void __sanitizer_syscall_post_impl_pathconf(long long res, long long path,
+                                            long long name);
+void __sanitizer_syscall_pre_impl_fpathconf(long long fd, long long name);
+void __sanitizer_syscall_post_impl_fpathconf(long long res, long long fd,
+                                             long long name);
+/* syscall 193 has been skipped */
+void __sanitizer_syscall_pre_impl_getrlimit(long long which, long long rlp);
+void __sanitizer_syscall_post_impl_getrlimit(long long res, long long which,
+                                             long long rlp);
+void __sanitizer_syscall_pre_impl_setrlimit(long long which, long long rlp);
+void __sanitizer_syscall_post_impl_setrlimit(long long res, long long which,
+                                             long long rlp);
+void __sanitizer_syscall_pre_impl_compat_12_getdirentries(long long fd,
+                                                          long long buf,
+                                                          long long count,
+                                                          long long basep);
+void __sanitizer_syscall_post_impl_compat_12_getdirentries(long long res,
+                                                           long long fd,
+                                                           long long buf,
+                                                           long long count,
+                                                           long long basep);
+void __sanitizer_syscall_pre_impl_mmap(long long addr, long long len,
+                                       long long prot, long long flags,
+                                       long long fd, long long PAD,
+                                       long long pos);
+void __sanitizer_syscall_post_impl_mmap(long long res, long long addr,
+                                        long long len, long long prot,
+                                        long long flags, long long fd,
+                                        long long PAD, long long pos);
+void __sanitizer_syscall_pre_impl___syscall(long long code, long long arg0,
+                                            long long arg1, long long arg2,
+                                            long long arg3, long long arg4,
+                                            long long arg5, long long arg6,
+                                            long long arg7);
+void __sanitizer_syscall_post_impl___syscall(long long res, long long code,
+                                             long long arg0, long long arg1,
+                                             long long arg2, long long arg3,
+                                             long long arg4, long long arg5,
+                                             long long arg6, long long arg7);
+void __sanitizer_syscall_pre_impl_lseek(long long fd, long long PAD,
+                                        long long offset, long long whence);
+void __sanitizer_syscall_post_impl_lseek(long long res, long long fd,
+                                         long long PAD, long long offset,
+                                         long long whence);
+void __sanitizer_syscall_pre_impl_truncate(long long path, long long PAD,
+                                           long long length);
+void __sanitizer_syscall_post_impl_truncate(long long res, long long path,
+                                            long long PAD, long long length);
+void __sanitizer_syscall_pre_impl_ftruncate(long long fd, long long PAD,
+                                            long long length);
+void __sanitizer_syscall_post_impl_ftruncate(long long res, long long fd,
+                                             long long PAD, long long length);
+void __sanitizer_syscall_pre_impl___sysctl(long long name, long long namelen,
+                                           long long oldv, long long oldlenp,
+                                           long long newv, long long newlen);
+void __sanitizer_syscall_post_impl___sysctl(long long res, long long name,
+                                            long long namelen, long long oldv,
+                                            long long oldlenp, long long newv,
+                                            long long newlen);
+void __sanitizer_syscall_pre_impl_mlock(long long addr, long long len);
+void __sanitizer_syscall_post_impl_mlock(long long res, long long addr,
+                                         long long len);
+void __sanitizer_syscall_pre_impl_munlock(long long addr, long long len);
+void __sanitizer_syscall_post_impl_munlock(long long res, long long addr,
+                                           long long len);
+void __sanitizer_syscall_pre_impl_undelete(long long path);
+void __sanitizer_syscall_post_impl_undelete(long long res, long long path);
+void __sanitizer_syscall_pre_impl_compat_50_futimes(long long fd,
+                                                    long long tptr);
+void __sanitizer_syscall_post_impl_compat_50_futimes(long long res,
+                                                     long long fd,
+                                                     long long tptr);
+void __sanitizer_syscall_pre_impl_getpgid(long long pid);
+void __sanitizer_syscall_post_impl_getpgid(long long res, long long pid);
+void __sanitizer_syscall_pre_impl_reboot(long long opt, long long bootstr);
+void __sanitizer_syscall_post_impl_reboot(long long res, long long opt,
+                                          long long bootstr);
+void __sanitizer_syscall_pre_impl_poll(long long fds, long long nfds,
+                                       long long timeout);
+void __sanitizer_syscall_post_impl_poll(long long res, long long fds,
+                                        long long nfds, long long timeout);
+void __sanitizer_syscall_pre_impl_afssys(long long id, long long a1,
+                                         long long a2, long long a3,
+                                         long long a4, long long a5,
+                                         long long a6);
+void __sanitizer_syscall_post_impl_afssys(long long res, long long id,
+                                          long long a1, long long a2,
+                                          long long a3, long long a4,
+                                          long long a5, long long a6);
+/* syscall 211 has been skipped */
+/* syscall 212 has been skipped */
+/* syscall 213 has been skipped */
+/* syscall 214 has been skipped */
+/* syscall 215 has been skipped */
+/* syscall 216 has been skipped */
+/* syscall 217 has been skipped */
+/* syscall 218 has been skipped */
+/* syscall 219 has been skipped */
+void __sanitizer_syscall_pre_impl_compat_14___semctl(long long semid,
+                                                     long long semnum,
+                                                     long long cmd,
+                                                     long long arg);
+void __sanitizer_syscall_post_impl_compat_14___semctl(long long res,
+                                                      long long semid,
+                                                      long long semnum,
+                                                      long long cmd,
+                                                      long long arg);
+void __sanitizer_syscall_pre_impl_semget(long long key, long long nsems,
+                                         long long semflg);
+void __sanitizer_syscall_post_impl_semget(long long res, long long key,
+                                          long long nsems, long long semflg);
+void __sanitizer_syscall_pre_impl_semop(long long semid, long long sops,
+                                        long long nsops);
+void __sanitizer_syscall_post_impl_semop(long long res, long long semid,
+                                         long long sops, long long nsops);
+void __sanitizer_syscall_pre_impl_semconfig(long long flag);
+void __sanitizer_syscall_post_impl_semconfig(long long res, long long flag);
+void __sanitizer_syscall_pre_impl_compat_14_msgctl(long long msqid,
+                                                   long long cmd,
+                                                   long long buf);
+void __sanitizer_syscall_post_impl_compat_14_msgctl(long long res,
+                                                    long long msqid,
+                                                    long long cmd,
+                                                    long long buf);
+void __sanitizer_syscall_pre_impl_msgget(long long key, long long msgflg);
+void __sanitizer_syscall_post_impl_msgget(long long res, long long key,
+                                          long long msgflg);
+void __sanitizer_syscall_pre_impl_msgsnd(long long msqid, long long msgp,
+                                         long long msgsz, long long msgflg);
+void __sanitizer_syscall_post_impl_msgsnd(long long res, long long msqid,
+                                          long long msgp, long long msgsz,
+                                          long long msgflg);
+void __sanitizer_syscall_pre_impl_msgrcv(long long msqid, long long msgp,
+                                         long long msgsz, long long msgtyp,
+                                         long long msgflg);
+void __sanitizer_syscall_post_impl_msgrcv(long long res, long long msqid,
+                                          long long msgp, long long msgsz,
+                                          long long msgtyp, long long msgflg);
+void __sanitizer_syscall_pre_impl_shmat(long long shmid, long long shmaddr,
+                                        long long shmflg);
+void __sanitizer_syscall_post_impl_shmat(long long res, long long shmid,
+                                         long long shmaddr, long long shmflg);
+void __sanitizer_syscall_pre_impl_compat_14_shmctl(long long shmid,
+                                                   long long cmd,
+                                                   long long buf);
+void __sanitizer_syscall_post_impl_compat_14_shmctl(long long res,
+                                                    long long shmid,
+                                                    long long cmd,
+                                                    long long buf);
+void __sanitizer_syscall_pre_impl_shmdt(long long shmaddr);
+void __sanitizer_syscall_post_impl_shmdt(long long res, long long shmaddr);
+void __sanitizer_syscall_pre_impl_shmget(long long key, long long size,
+                                         long long shmflg);
+void __sanitizer_syscall_post_impl_shmget(long long res, long long key,
+                                          long long size, long long shmflg);
+void __sanitizer_syscall_pre_impl_compat_50_clock_gettime(long long clock_id,
+                                                          long long tp);
+void __sanitizer_syscall_post_impl_compat_50_clock_gettime(long long res,
+                                                           long long clock_id,
+                                                           long long tp);
+void __sanitizer_syscall_pre_impl_compat_50_clock_settime(long long clock_id,
+                                                          long long tp);
+void __sanitizer_syscall_post_impl_compat_50_clock_settime(long long res,
+                                                           long long clock_id,
+                                                           long long tp);
+void __sanitizer_syscall_pre_impl_compat_50_clock_getres(long long clock_id,
+                                                         long long tp);
+void __sanitizer_syscall_post_impl_compat_50_clock_getres(long long res,
+                                                          long long clock_id,
+                                                          long long tp);
+void __sanitizer_syscall_pre_impl_timer_create(long long clock_id,
+                                               long long evp,
+                                               long long timerid);
+void __sanitizer_syscall_post_impl_timer_create(long long res,
+                                                long long clock_id,
+                                                long long evp,
+                                                long long timerid);
+void __sanitizer_syscall_pre_impl_timer_delete(long long timerid);
+void __sanitizer_syscall_post_impl_timer_delete(long long res,
+                                                long long timerid);
+void __sanitizer_syscall_pre_impl_compat_50_timer_settime(long long timerid,
+                                                          long long flags,
+                                                          long long value,
+                                                          long long ovalue);
+void __sanitizer_syscall_post_impl_compat_50_timer_settime(long long res,
+                                                           long long timerid,
+                                                           long long flags,
+                                                           long long value,
+                                                           long long ovalue);
+void __sanitizer_syscall_pre_impl_compat_50_timer_gettime(long long timerid,
+                                                          long long value);
+void __sanitizer_syscall_post_impl_compat_50_timer_gettime(long long res,
+                                                           long long timerid,
+                                                           long long value);
+void __sanitizer_syscall_pre_impl_timer_getoverrun(long long timerid);
+void __sanitizer_syscall_post_impl_timer_getoverrun(long long res,
+                                                    long long timerid);
+void __sanitizer_syscall_pre_impl_compat_50_nanosleep(long long rqtp,
+                                                      long long rmtp);
+void __sanitizer_syscall_post_impl_compat_50_nanosleep(long long res,
+                                                       long long rqtp,
+                                                       long long rmtp);
+void __sanitizer_syscall_pre_impl_fdatasync(long long fd);
+void __sanitizer_syscall_post_impl_fdatasync(long long res, long long fd);
+void __sanitizer_syscall_pre_impl_mlockall(long long flags);
+void __sanitizer_syscall_post_impl_mlockall(long long res, long long flags);
+void __sanitizer_syscall_pre_impl_munlockall(void);
+void __sanitizer_syscall_post_impl_munlockall(long long res);
+void __sanitizer_syscall_pre_impl_compat_50___sigtimedwait(long long set,
+                                                           long long info,
+                                                           long long timeout);
+void __sanitizer_syscall_post_impl_compat_50___sigtimedwait(long long res,
+                                                            long long set,
+                                                            long long info,
+                                                            long long timeout);
+void __sanitizer_syscall_pre_impl_sigqueueinfo(long long pid, long long info);
+void __sanitizer_syscall_post_impl_sigqueueinfo(long long res, long long pid,
+                                                long long info);
+void __sanitizer_syscall_pre_impl_modctl(long long cmd, long long arg);
+void __sanitizer_syscall_post_impl_modctl(long long res, long long cmd,
+                                          long long arg);
+void __sanitizer_syscall_pre_impl__ksem_init(long long value, long long idp);
+void __sanitizer_syscall_post_impl__ksem_init(long long res, long long value,
+                                              long long idp);
+void __sanitizer_syscall_pre_impl__ksem_open(long long name, long long oflag,
+                                             long long mode, long long value,
+                                             long long idp);
+void __sanitizer_syscall_post_impl__ksem_open(long long res, long long name,
+                                              long long oflag, long long mode,
+                                              long long value, long long idp);
+void __sanitizer_syscall_pre_impl__ksem_unlink(long long name);
+void __sanitizer_syscall_post_impl__ksem_unlink(long long res, long long name);
+void __sanitizer_syscall_pre_impl__ksem_close(long long id);
+void __sanitizer_syscall_post_impl__ksem_close(long long res, long long id);
+void __sanitizer_syscall_pre_impl__ksem_post(long long id);
+void __sanitizer_syscall_post_impl__ksem_post(long long res, long long id);
+void __sanitizer_syscall_pre_impl__ksem_wait(long long id);
+void __sanitizer_syscall_post_impl__ksem_wait(long long res, long long id);
+void __sanitizer_syscall_pre_impl__ksem_trywait(long long id);
+void __sanitizer_syscall_post_impl__ksem_trywait(long long res, long long id);
+void __sanitizer_syscall_pre_impl__ksem_getvalue(long long id, long long value);
+void __sanitizer_syscall_post_impl__ksem_getvalue(long long res, long long id,
+                                                  long long value);
+void __sanitizer_syscall_pre_impl__ksem_destroy(long long id);
+void __sanitizer_syscall_post_impl__ksem_destroy(long long res, long long id);
+void __sanitizer_syscall_pre_impl__ksem_timedwait(long long id,
+                                                  long long abstime);
+void __sanitizer_syscall_post_impl__ksem_timedwait(long long res, long long id,
+                                                   long long abstime);
+void __sanitizer_syscall_pre_impl_mq_open(long long name, long long oflag,
+                                          long long mode, long long attr);
+void __sanitizer_syscall_post_impl_mq_open(long long res, long long name,
+                                           long long oflag, long long mode,
+                                           long long attr);
+void __sanitizer_syscall_pre_impl_mq_close(long long mqdes);
+void __sanitizer_syscall_post_impl_mq_close(long long res, long long mqdes);
+void __sanitizer_syscall_pre_impl_mq_unlink(long long name);
+void __sanitizer_syscall_post_impl_mq_unlink(long long res, long long name);
+void __sanitizer_syscall_pre_impl_mq_getattr(long long mqdes, long long mqstat);
+void __sanitizer_syscall_post_impl_mq_getattr(long long res, long long mqdes,
+                                              long long mqstat);
+void __sanitizer_syscall_pre_impl_mq_setattr(long long mqdes, long long mqstat,
+                                             long long omqstat);
+void __sanitizer_syscall_post_impl_mq_setattr(long long res, long long mqdes,
+                                              long long mqstat,
+                                              long long omqstat);
+void __sanitizer_syscall_pre_impl_mq_notify(long long mqdes,
+                                            long long notification);
+void __sanitizer_syscall_post_impl_mq_notify(long long res, long long mqdes,
+                                             long long notification);
+void __sanitizer_syscall_pre_impl_mq_send(long long mqdes, long long msg_ptr,
+                                          long long msg_len,
+                                          long long msg_prio);
+void __sanitizer_syscall_post_impl_mq_send(long long res, long long mqdes,
+                                           long long msg_ptr, long long msg_len,
+                                           long long msg_prio);
+void __sanitizer_syscall_pre_impl_mq_receive(long long mqdes, long long msg_ptr,
+                                             long long msg_len,
+                                             long long msg_prio);
+void __sanitizer_syscall_post_impl_mq_receive(long long res, long long mqdes,
+                                              long long msg_ptr,
+                                              long long msg_len,
+                                              long long msg_prio);
+void __sanitizer_syscall_pre_impl_compat_50_mq_timedsend(long long mqdes,
+                                                         long long msg_ptr,
+                                                         long long msg_len,
+                                                         long long msg_prio,
+                                                         long long abs_timeout);
+void __sanitizer_syscall_post_impl_compat_50_mq_timedsend(
+    long long res, long long mqdes, long long msg_ptr, long long msg_len,
+    long long msg_prio, long long abs_timeout);
+void __sanitizer_syscall_pre_impl_compat_50_mq_timedreceive(
+    long long mqdes, long long msg_ptr, long long msg_len, long long msg_prio,
+    long long abs_timeout);
+void __sanitizer_syscall_post_impl_compat_50_mq_timedreceive(
+    long long res, long long mqdes, long long msg_ptr, long long msg_len,
+    long long msg_prio, long long abs_timeout);
+/* syscall 267 has been skipped */
+/* syscall 268 has been skipped */
+/* syscall 269 has been skipped */
+void __sanitizer_syscall_pre_impl___posix_rename(long long from, long long to);
+void __sanitizer_syscall_post_impl___posix_rename(long long res, long long from,
+                                                  long long to);
+void __sanitizer_syscall_pre_impl_swapctl(long long cmd, long long arg,
+                                          long long misc);
+void __sanitizer_syscall_post_impl_swapctl(long long res, long long cmd,
+                                           long long arg, long long misc);
+void __sanitizer_syscall_pre_impl_compat_30_getdents(long long fd,
+                                                     long long buf,
+                                                     long long count);
+void __sanitizer_syscall_post_impl_compat_30_getdents(long long res,
+                                                      long long fd,
+                                                      long long buf,
+                                                      long long count);
+void __sanitizer_syscall_pre_impl_minherit(long long addr, long long len,
+                                           long long inherit);
+void __sanitizer_syscall_post_impl_minherit(long long res, long long addr,
+                                            long long len, long long inherit);
+void __sanitizer_syscall_pre_impl_lchmod(long long path, long long mode);
+void __sanitizer_syscall_post_impl_lchmod(long long res, long long path,
+                                          long long mode);
+void __sanitizer_syscall_pre_impl_lchown(long long path, long long uid,
+                                         long long gid);
+void __sanitizer_syscall_post_impl_lchown(long long res, long long path,
+                                          long long uid, long long gid);
+void __sanitizer_syscall_pre_impl_compat_50_lutimes(long long path,
+                                                    long long tptr);
+void __sanitizer_syscall_post_impl_compat_50_lutimes(long long res,
+                                                     long long path,
+                                                     long long tptr);
+void __sanitizer_syscall_pre_impl___msync13(long long addr, long long len,
+                                            long long flags);
+void __sanitizer_syscall_post_impl___msync13(long long res, long long addr,
+                                             long long len, long long flags);
+void __sanitizer_syscall_pre_impl_compat_30___stat13(long long path,
+                                                     long long ub);
+void __sanitizer_syscall_post_impl_compat_30___stat13(long long res,
+                                                      long long path,
+                                                      long long ub);
+void __sanitizer_syscall_pre_impl_compat_30___fstat13(long long fd,
+                                                      long long sb);
+void __sanitizer_syscall_post_impl_compat_30___fstat13(long long res,
+                                                       long long fd,
+                                                       long long sb);
+void __sanitizer_syscall_pre_impl_compat_30___lstat13(long long path,
+                                                      long long ub);
+void __sanitizer_syscall_post_impl_compat_30___lstat13(long long res,
+                                                       long long path,
+                                                       long long ub);
+void __sanitizer_syscall_pre_impl___sigaltstack14(long long nss, long long oss);
+void __sanitizer_syscall_post_impl___sigaltstack14(long long res, long long nss,
+                                                   long long oss);
+void __sanitizer_syscall_pre_impl___vfork14(void);
+void __sanitizer_syscall_post_impl___vfork14(long long res);
+void __sanitizer_syscall_pre_impl___posix_chown(long long path, long long uid,
+                                                long long gid);
+void __sanitizer_syscall_post_impl___posix_chown(long long res, long long path,
+                                                 long long uid, long long gid);
+void __sanitizer_syscall_pre_impl___posix_fchown(long long fd, long long uid,
+                                                 long long gid);
+void __sanitizer_syscall_post_impl___posix_fchown(long long res, long long fd,
+                                                  long long uid, long long gid);
+void __sanitizer_syscall_pre_impl___posix_lchown(long long path, long long uid,
+                                                 long long gid);
+void __sanitizer_syscall_post_impl___posix_lchown(long long res, long long path,
+                                                  long long uid, long long gid);
+void __sanitizer_syscall_pre_impl_getsid(long long pid);
+void __sanitizer_syscall_post_impl_getsid(long long res, long long pid);
+void __sanitizer_syscall_pre_impl___clone(long long flags, long long stack);
+void __sanitizer_syscall_post_impl___clone(long long res, long long flags,
+                                           long long stack);
+void __sanitizer_syscall_pre_impl_fktrace(long long fd, long long ops,
+                                          long long facs, long long pid);
+void __sanitizer_syscall_post_impl_fktrace(long long res, long long fd,
+                                           long long ops, long long facs,
+                                           long long pid);
+void __sanitizer_syscall_pre_impl_preadv(long long fd, long long iovp,
+                                         long long iovcnt, long long PAD,
+                                         long long offset);
+void __sanitizer_syscall_post_impl_preadv(long long res, long long fd,
+                                          long long iovp, long long iovcnt,
+                                          long long PAD, long long offset);
+void __sanitizer_syscall_pre_impl_pwritev(long long fd, long long iovp,
+                                          long long iovcnt, long long PAD,
+                                          long long offset);
+void __sanitizer_syscall_post_impl_pwritev(long long res, long long fd,
+                                           long long iovp, long long iovcnt,
+                                           long long PAD, long long offset);
+void __sanitizer_syscall_pre_impl_compat_16___sigaction14(long long signum,
+                                                          long long nsa,
+                                                          long long osa);
+void __sanitizer_syscall_post_impl_compat_16___sigaction14(long long res,
+                                                           long long signum,
+                                                           long long nsa,
+                                                           long long osa);
+void __sanitizer_syscall_pre_impl___sigpending14(long long set);
+void __sanitizer_syscall_post_impl___sigpending14(long long res, long long set);
+void __sanitizer_syscall_pre_impl___sigprocmask14(long long how, long long set,
+                                                  long long oset);
+void __sanitizer_syscall_post_impl___sigprocmask14(long long res, long long how,
+                                                   long long set,
+                                                   long long oset);
+void __sanitizer_syscall_pre_impl___sigsuspend14(long long set);
+void __sanitizer_syscall_post_impl___sigsuspend14(long long res, long long set);
+void __sanitizer_syscall_pre_impl_compat_16___sigreturn14(long long sigcntxp);
+void __sanitizer_syscall_post_impl_compat_16___sigreturn14(long long res,
+                                                           long long sigcntxp);
+void __sanitizer_syscall_pre_impl___getcwd(long long bufp, long long length);
+void __sanitizer_syscall_post_impl___getcwd(long long res, long long bufp,
+                                            long long length);
+void __sanitizer_syscall_pre_impl_fchroot(long long fd);
+void __sanitizer_syscall_post_impl_fchroot(long long res, long long fd);
+void __sanitizer_syscall_pre_impl_compat_30_fhopen(long long fhp,
+                                                   long long flags);
+void __sanitizer_syscall_post_impl_compat_30_fhopen(long long res,
+                                                    long long fhp,
+                                                    long long flags);
+void __sanitizer_syscall_pre_impl_compat_30_fhstat(long long fhp, long long sb);
+void __sanitizer_syscall_post_impl_compat_30_fhstat(long long res,
+                                                    long long fhp,
+                                                    long long sb);
+void __sanitizer_syscall_pre_impl_compat_20_fhstatfs(long long fhp,
+                                                     long long buf);
+void __sanitizer_syscall_post_impl_compat_20_fhstatfs(long long res,
+                                                      long long fhp,
+                                                      long long buf);
+void __sanitizer_syscall_pre_impl_compat_50_____semctl13(long long semid,
+                                                         long long semnum,
+                                                         long long cmd,
+                                                         long long arg);
+void __sanitizer_syscall_post_impl_compat_50_____semctl13(long long res,
+                                                          long long semid,
+                                                          long long semnum,
+                                                          long long cmd,
+                                                          long long arg);
+void __sanitizer_syscall_pre_impl_compat_50___msgctl13(long long msqid,
+                                                       long long cmd,
+                                                       long long buf);
+void __sanitizer_syscall_post_impl_compat_50___msgctl13(long long res,
+                                                        long long msqid,
+                                                        long long cmd,
+                                                        long long buf);
+void __sanitizer_syscall_pre_impl_compat_50___shmctl13(long long shmid,
+                                                       long long cmd,
+                                                       long long buf);
+void __sanitizer_syscall_post_impl_compat_50___shmctl13(long long res,
+                                                        long long shmid,
+                                                        long long cmd,
+                                                        long long buf);
+void __sanitizer_syscall_pre_impl_lchflags(long long path, long long flags);
+void __sanitizer_syscall_post_impl_lchflags(long long res, long long path,
+                                            long long flags);
+void __sanitizer_syscall_pre_impl_issetugid(void);
+void __sanitizer_syscall_post_impl_issetugid(long long res);
+void __sanitizer_syscall_pre_impl_utrace(long long label, long long addr,
+                                         long long len);
+void __sanitizer_syscall_post_impl_utrace(long long res, long long label,
+                                          long long addr, long long len);
+void __sanitizer_syscall_pre_impl_getcontext(long long ucp);
+void __sanitizer_syscall_post_impl_getcontext(long long res, long long ucp);
+void __sanitizer_syscall_pre_impl_setcontext(long long ucp);
+void __sanitizer_syscall_post_impl_setcontext(long long res, long long ucp);
+void __sanitizer_syscall_pre_impl__lwp_create(long long ucp, long long flags,
+                                              long long new_lwp);
+void __sanitizer_syscall_post_impl__lwp_create(long long res, long long ucp,
+                                               long long flags,
+                                               long long new_lwp);
+void __sanitizer_syscall_pre_impl__lwp_exit(void);
+void __sanitizer_syscall_post_impl__lwp_exit(long long res);
+void __sanitizer_syscall_pre_impl__lwp_self(void);
+void __sanitizer_syscall_post_impl__lwp_self(long long res);
+void __sanitizer_syscall_pre_impl__lwp_wait(long long wait_for,
+                                            long long departed);
+void __sanitizer_syscall_post_impl__lwp_wait(long long res, long long wait_for,
+                                             long long departed);
+void __sanitizer_syscall_pre_impl__lwp_suspend(long long target);
+void __sanitizer_syscall_post_impl__lwp_suspend(long long res,
+                                                long long target);
+void __sanitizer_syscall_pre_impl__lwp_continue(long long target);
+void __sanitizer_syscall_post_impl__lwp_continue(long long res,
+                                                 long long target);
+void __sanitizer_syscall_pre_impl__lwp_wakeup(long long target);
+void __sanitizer_syscall_post_impl__lwp_wakeup(long long res, long long target);
+void __sanitizer_syscall_pre_impl__lwp_getprivate(void);
+void __sanitizer_syscall_post_impl__lwp_getprivate(long long res);
+void __sanitizer_syscall_pre_impl__lwp_setprivate(long long ptr);
+void __sanitizer_syscall_post_impl__lwp_setprivate(long long res,
+                                                   long long ptr);
+void __sanitizer_syscall_pre_impl__lwp_kill(long long target, long long signo);
+void __sanitizer_syscall_post_impl__lwp_kill(long long res, long long target,
+                                             long long signo);
+void __sanitizer_syscall_pre_impl__lwp_detach(long long target);
+void __sanitizer_syscall_post_impl__lwp_detach(long long res, long long target);
+void __sanitizer_syscall_pre_impl_compat_50__lwp_park(long long ts,
+                                                      long long unpark,
+                                                      long long hint,
+                                                      long long unparkhint);
+void __sanitizer_syscall_post_impl_compat_50__lwp_park(long long res,
+                                                       long long ts,
+                                                       long long unpark,
+                                                       long long hint,
+                                                       long long unparkhint);
+void __sanitizer_syscall_pre_impl__lwp_unpark(long long target, long long hint);
+void __sanitizer_syscall_post_impl__lwp_unpark(long long res, long long target,
+                                               long long hint);
+void __sanitizer_syscall_pre_impl__lwp_unpark_all(long long targets,
+                                                  long long ntargets,
+                                                  long long hint);
+void __sanitizer_syscall_post_impl__lwp_unpark_all(long long res,
+                                                   long long targets,
+                                                   long long ntargets,
+                                                   long long hint);
+void __sanitizer_syscall_pre_impl__lwp_setname(long long target,
+                                               long long name);
+void __sanitizer_syscall_post_impl__lwp_setname(long long res, long long target,
+                                                long long name);
+void __sanitizer_syscall_pre_impl__lwp_getname(long long target, long long name,
+                                               long long len);
+void __sanitizer_syscall_post_impl__lwp_getname(long long res, long long target,
+                                                long long name, long long len);
+void __sanitizer_syscall_pre_impl__lwp_ctl(long long features,
+                                           long long address);
+void __sanitizer_syscall_post_impl__lwp_ctl(long long res, long long features,
+                                            long long address);
+/* syscall 326 has been skipped */
+/* syscall 327 has been skipped */
+/* syscall 328 has been skipped */
+/* syscall 329 has been skipped */
+void __sanitizer_syscall_pre_impl_compat_60_sa_register(
+    long long newv, long long oldv, long long flags,
+    long long stackinfo_offset);
+void __sanitizer_syscall_post_impl_compat_60_sa_register(
+    long long res, long long newv, long long oldv, long long flags,
+    long long stackinfo_offset);
+void __sanitizer_syscall_pre_impl_compat_60_sa_stacks(long long num,
+                                                      long long stacks);
+void __sanitizer_syscall_post_impl_compat_60_sa_stacks(long long res,
+                                                       long long num,
+                                                       long long stacks);
+void __sanitizer_syscall_pre_impl_compat_60_sa_enable(void);
+void __sanitizer_syscall_post_impl_compat_60_sa_enable(long long res);
+void __sanitizer_syscall_pre_impl_compat_60_sa_setconcurrency(
+    long long concurrency);
+void __sanitizer_syscall_post_impl_compat_60_sa_setconcurrency(
+    long long res, long long concurrency);
+void __sanitizer_syscall_pre_impl_compat_60_sa_yield(void);
+void __sanitizer_syscall_post_impl_compat_60_sa_yield(long long res);
+void __sanitizer_syscall_pre_impl_compat_60_sa_preempt(long long sa_id);
+void __sanitizer_syscall_post_impl_compat_60_sa_preempt(long long res,
+                                                        long long sa_id);
+/* syscall 336 has been skipped */
+/* syscall 337 has been skipped */
+/* syscall 338 has been skipped */
+/* syscall 339 has been skipped */
+void __sanitizer_syscall_pre_impl___sigaction_sigtramp(long long signum,
+                                                       long long nsa,
+                                                       long long osa,
+                                                       long long tramp,
+                                                       long long vers);
+void __sanitizer_syscall_post_impl___sigaction_sigtramp(
+    long long res, long long signum, long long nsa, long long osa,
+    long long tramp, long long vers);
+void __sanitizer_syscall_pre_impl_pmc_get_info(long long ctr, long long op,
+                                               long long args);
+void __sanitizer_syscall_post_impl_pmc_get_info(long long res, long long ctr,
+                                                long long op, long long args);
+void __sanitizer_syscall_pre_impl_pmc_control(long long ctr, long long op,
+                                              long long args);
+void __sanitizer_syscall_post_impl_pmc_control(long long res, long long ctr,
+                                               long long op, long long args);
+void __sanitizer_syscall_pre_impl_rasctl(long long addr, long long len,
+                                         long long op);
+void __sanitizer_syscall_post_impl_rasctl(long long res, long long addr,
+                                          long long len, long long op);
+void __sanitizer_syscall_pre_impl_kqueue(void);
+void __sanitizer_syscall_post_impl_kqueue(long long res);
+void __sanitizer_syscall_pre_impl_compat_50_kevent(
+    long long fd, long long changelist, long long nchanges, long long eventlist,
+    long long nevents, long long timeout);
+void __sanitizer_syscall_post_impl_compat_50_kevent(
+    long long res, long long fd, long long changelist, long long nchanges,
+    long long eventlist, long long nevents, long long timeout);
+void __sanitizer_syscall_pre_impl__sched_setparam(long long pid, long long lid,
+                                                  long long policy,
+                                                  long long params);
+void __sanitizer_syscall_post_impl__sched_setparam(long long res, long long pid,
+                                                   long long lid,
+                                                   long long policy,
+                                                   long long params);
+void __sanitizer_syscall_pre_impl__sched_getparam(long long pid, long long lid,
+                                                  long long policy,
+                                                  long long params);
+void __sanitizer_syscall_post_impl__sched_getparam(long long res, long long pid,
+                                                   long long lid,
+                                                   long long policy,
+                                                   long long params);
+void __sanitizer_syscall_pre_impl__sched_setaffinity(long long pid,
+                                                     long long lid,
+                                                     long long size,
+                                                     long long cpuset);
+void __sanitizer_syscall_post_impl__sched_setaffinity(long long res,
+                                                      long long pid,
+                                                      long long lid,
+                                                      long long size,
+                                                      long long cpuset);
+void __sanitizer_syscall_pre_impl__sched_getaffinity(long long pid,
+                                                     long long lid,
+                                                     long long size,
+                                                     long long cpuset);
+void __sanitizer_syscall_post_impl__sched_getaffinity(long long res,
+                                                      long long pid,
+                                                      long long lid,
+                                                      long long size,
+                                                      long long cpuset);
+void __sanitizer_syscall_pre_impl_sched_yield(void);
+void __sanitizer_syscall_post_impl_sched_yield(long long res);
+void __sanitizer_syscall_pre_impl__sched_protect(long long priority);
+void __sanitizer_syscall_post_impl__sched_protect(long long res,
+                                                  long long priority);
+/* syscall 352 has been skipped */
+/* syscall 353 has been skipped */
+void __sanitizer_syscall_pre_impl_fsync_range(long long fd, long long flags,
+                                              long long start,
+                                              long long length);
+void __sanitizer_syscall_post_impl_fsync_range(long long res, long long fd,
+                                               long long flags, long long start,
+                                               long long length);
+void __sanitizer_syscall_pre_impl_uuidgen(long long store, long long count);
+void __sanitizer_syscall_post_impl_uuidgen(long long res, long long store,
+                                           long long count);
+void __sanitizer_syscall_pre_impl_getvfsstat(long long buf, long long bufsize,
+                                             long long flags);
+void __sanitizer_syscall_post_impl_getvfsstat(long long res, long long buf,
+                                              long long bufsize,
+                                              long long flags);
+void __sanitizer_syscall_pre_impl_statvfs1(long long path, long long buf,
+                                           long long flags);
+void __sanitizer_syscall_post_impl_statvfs1(long long res, long long path,
+                                            long long buf, long long flags);
+void __sanitizer_syscall_pre_impl_fstatvfs1(long long fd, long long buf,
+                                            long long flags);
+void __sanitizer_syscall_post_impl_fstatvfs1(long long res, long long fd,
+                                             long long buf, long long flags);
+void __sanitizer_syscall_pre_impl_compat_30_fhstatvfs1(long long fhp,
+                                                       long long buf,
+                                                       long long flags);
+void __sanitizer_syscall_post_impl_compat_30_fhstatvfs1(long long res,
+                                                        long long fhp,
+                                                        long long buf,
+                                                        long long flags);
+void __sanitizer_syscall_pre_impl_extattrctl(long long path, long long cmd,
+                                             long long filename,
+                                             long long attrnamespace,
+                                             long long attrname);
+void __sanitizer_syscall_post_impl_extattrctl(long long res, long long path,
+                                              long long cmd, long long filename,
+                                              long long attrnamespace,
+                                              long long attrname);
+void __sanitizer_syscall_pre_impl_extattr_set_file(long long path,
+                                                   long long attrnamespace,
+                                                   long long attrname,
+                                                   long long data,
+                                                   long long nbytes);
+void __sanitizer_syscall_post_impl_extattr_set_file(
+    long long res, long long path, long long attrnamespace, long long attrname,
+    long long data, long long nbytes);
+void __sanitizer_syscall_pre_impl_extattr_get_file(long long path,
+                                                   long long attrnamespace,
+                                                   long long attrname,
+                                                   long long data,
+                                                   long long nbytes);
+void __sanitizer_syscall_post_impl_extattr_get_file(
+    long long res, long long path, long long attrnamespace, long long attrname,
+    long long data, long long nbytes);
+void __sanitizer_syscall_pre_impl_extattr_delete_file(long long path,
+                                                      long long attrnamespace,
+                                                      long long attrname);
+void __sanitizer_syscall_post_impl_extattr_delete_file(long long res,
+                                                       long long path,
+                                                       long long attrnamespace,
+                                                       long long attrname);
+void __sanitizer_syscall_pre_impl_extattr_set_fd(long long fd,
+                                                 long long attrnamespace,
+                                                 long long attrname,
+                                                 long long data,
+                                                 long long nbytes);
+void __sanitizer_syscall_post_impl_extattr_set_fd(long long res, long long fd,
+                                                  long long attrnamespace,
+                                                  long long attrname,
+                                                  long long data,
+                                                  long long nbytes);
+void __sanitizer_syscall_pre_impl_extattr_get_fd(long long fd,
+                                                 long long attrnamespace,
+                                                 long long attrname,
+                                                 long long data,
+                                                 long long nbytes);
+void __sanitizer_syscall_post_impl_extattr_get_fd(long long res, long long fd,
+                                                  long long attrnamespace,
+                                                  long long attrname,
+                                                  long long data,
+                                                  long long nbytes);
+void __sanitizer_syscall_pre_impl_extattr_delete_fd(long long fd,
+                                                    long long attrnamespace,
+                                                    long long attrname);
+void __sanitizer_syscall_post_impl_extattr_delete_fd(long long res,
+                                                     long long fd,
+                                                     long long attrnamespace,
+                                                     long long attrname);
+void __sanitizer_syscall_pre_impl_extattr_set_link(long long path,
+                                                   long long attrnamespace,
+                                                   long long attrname,
+                                                   long long data,
+                                                   long long nbytes);
+void __sanitizer_syscall_post_impl_extattr_set_link(
+    long long res, long long path, long long attrnamespace, long long attrname,
+    long long data, long long nbytes);
+void __sanitizer_syscall_pre_impl_extattr_get_link(long long path,
+                                                   long long attrnamespace,
+                                                   long long attrname,
+                                                   long long data,
+                                                   long long nbytes);
+void __sanitizer_syscall_post_impl_extattr_get_link(
+    long long res, long long path, long long attrnamespace, long long attrname,
+    long long data, long long nbytes);
+void __sanitizer_syscall_pre_impl_extattr_delete_link(long long path,
+                                                      long long attrnamespace,
+                                                      long long attrname);
+void __sanitizer_syscall_post_impl_extattr_delete_link(long long res,
+                                                       long long path,
+                                                       long long attrnamespace,
+                                                       long long attrname);
+void __sanitizer_syscall_pre_impl_extattr_list_fd(long long fd,
+                                                  long long attrnamespace,
+                                                  long long data,
+                                                  long long nbytes);
+void __sanitizer_syscall_post_impl_extattr_list_fd(long long res, long long fd,
+                                                   long long attrnamespace,
+                                                   long long data,
+                                                   long long nbytes);
+void __sanitizer_syscall_pre_impl_extattr_list_file(long long path,
+                                                    long long attrnamespace,
+                                                    long long data,
+                                                    long long nbytes);
+void __sanitizer_syscall_post_impl_extattr_list_file(long long res,
+                                                     long long path,
+                                                     long long attrnamespace,
+                                                     long long data,
+                                                     long long nbytes);
+void __sanitizer_syscall_pre_impl_extattr_list_link(long long path,
+                                                    long long attrnamespace,
+                                                    long long data,
+                                                    long long nbytes);
+void __sanitizer_syscall_post_impl_extattr_list_link(long long res,
+                                                     long long path,
+                                                     long long attrnamespace,
+                                                     long long data,
+                                                     long long nbytes);
+void __sanitizer_syscall_pre_impl_compat_50_pselect(long long nd, long long in,
+                                                    long long ou, long long ex,
+                                                    long long ts,
+                                                    long long mask);
+void __sanitizer_syscall_post_impl_compat_50_pselect(long long res,
+                                                     long long nd, long long in,
+                                                     long long ou, long long ex,
+                                                     long long ts,
+                                                     long long mask);
+void __sanitizer_syscall_pre_impl_compat_50_pollts(long long fds,
+                                                   long long nfds, long long ts,
+                                                   long long mask);
+void __sanitizer_syscall_post_impl_compat_50_pollts(
+    long long res, long long fds, long long nfds, long long ts, long long mask);
+void __sanitizer_syscall_pre_impl_setxattr(long long path, long long name,
+                                           long long value, long long size,
+                                           long long flags);
+void __sanitizer_syscall_post_impl_setxattr(long long res, long long path,
+                                            long long name, long long value,
+                                            long long size, long long flags);
+void __sanitizer_syscall_pre_impl_lsetxattr(long long path, long long name,
+                                            long long value, long long size,
+                                            long long flags);
+void __sanitizer_syscall_post_impl_lsetxattr(long long res, long long path,
+                                             long long name, long long value,
+                                             long long size, long long flags);
+void __sanitizer_syscall_pre_impl_fsetxattr(long long fd, long long name,
+                                            long long value, long long size,
+                                            long long flags);
+void __sanitizer_syscall_post_impl_fsetxattr(long long res, long long fd,
+                                             long long name, long long value,
+                                             long long size, long long flags);
+void __sanitizer_syscall_pre_impl_getxattr(long long path, long long name,
+                                           long long value, long long size);
+void __sanitizer_syscall_post_impl_getxattr(long long res, long long path,
+                                            long long name, long long value,
+                                            long long size);
+void __sanitizer_syscall_pre_impl_lgetxattr(long long path, long long name,
+                                            long long value, long long size);
+void __sanitizer_syscall_post_impl_lgetxattr(long long res, long long path,
+                                             long long name, long long value,
+                                             long long size);
+void __sanitizer_syscall_pre_impl_fgetxattr(long long fd, long long name,
+                                            long long value, long long size);
+void __sanitizer_syscall_post_impl_fgetxattr(long long res, long long fd,
+                                             long long name, long long value,
+                                             long long size);
+void __sanitizer_syscall_pre_impl_listxattr(long long path, long long list,
+                                            long long size);
+void __sanitizer_syscall_post_impl_listxattr(long long res, long long path,
+                                             long long list, long long size);
+void __sanitizer_syscall_pre_impl_llistxattr(long long path, long long list,
+                                             long long size);
+void __sanitizer_syscall_post_impl_llistxattr(long long res, long long path,
+                                              long long list, long long size);
+void __sanitizer_syscall_pre_impl_flistxattr(long long fd, long long list,
+                                             long long size);
+void __sanitizer_syscall_post_impl_flistxattr(long long res, long long fd,
+                                              long long list, long long size);
+void __sanitizer_syscall_pre_impl_removexattr(long long path, long long name);
+void __sanitizer_syscall_post_impl_removexattr(long long res, long long path,
+                                               long long name);
+void __sanitizer_syscall_pre_impl_lremovexattr(long long path, long long name);
+void __sanitizer_syscall_post_impl_lremovexattr(long long res, long long path,
+                                                long long name);
+void __sanitizer_syscall_pre_impl_fremovexattr(long long fd, long long name);
+void __sanitizer_syscall_post_impl_fremovexattr(long long res, long long fd,
+                                                long long name);
+void __sanitizer_syscall_pre_impl_compat_50___stat30(long long path,
+                                                     long long ub);
+void __sanitizer_syscall_post_impl_compat_50___stat30(long long res,
+                                                      long long path,
+                                                      long long ub);
+void __sanitizer_syscall_pre_impl_compat_50___fstat30(long long fd,
+                                                      long long sb);
+void __sanitizer_syscall_post_impl_compat_50___fstat30(long long res,
+                                                       long long fd,
+                                                       long long sb);
+void __sanitizer_syscall_pre_impl_compat_50___lstat30(long long path,
+                                                      long long ub);
+void __sanitizer_syscall_post_impl_compat_50___lstat30(long long res,
+                                                       long long path,
+                                                       long long ub);
+void __sanitizer_syscall_pre_impl___getdents30(long long fd, long long buf,
+                                               long long count);
+void __sanitizer_syscall_post_impl___getdents30(long long res, long long fd,
+                                                long long buf, long long count);
+void __sanitizer_syscall_pre_impl_posix_fadvise(long long);
+void __sanitizer_syscall_post_impl_posix_fadvise(long long res, long long);
+void __sanitizer_syscall_pre_impl_compat_30___fhstat30(long long fhp,
+                                                       long long sb);
+void __sanitizer_syscall_post_impl_compat_30___fhstat30(long long res,
+                                                        long long fhp,
+                                                        long long sb);
+void __sanitizer_syscall_pre_impl_compat_50___ntp_gettime30(long long ntvp);
+void __sanitizer_syscall_post_impl_compat_50___ntp_gettime30(long long res,
+                                                             long long ntvp);
+void __sanitizer_syscall_pre_impl___socket30(long long domain, long long type,
+                                             long long protocol);
+void __sanitizer_syscall_post_impl___socket30(long long res, long long domain,
+                                              long long type,
+                                              long long protocol);
+void __sanitizer_syscall_pre_impl___getfh30(long long fname, long long fhp,
+                                            long long fh_size);
+void __sanitizer_syscall_post_impl___getfh30(long long res, long long fname,
+                                             long long fhp, long long fh_size);
+void __sanitizer_syscall_pre_impl___fhopen40(long long fhp, long long fh_size,
+                                             long long flags);
+void __sanitizer_syscall_post_impl___fhopen40(long long res, long long fhp,
+                                              long long fh_size,
+                                              long long flags);
+void __sanitizer_syscall_pre_impl___fhstatvfs140(long long fhp,
+                                                 long long fh_size,
+                                                 long long buf,
+                                                 long long flags);
+void __sanitizer_syscall_post_impl___fhstatvfs140(long long res, long long fhp,
+                                                  long long fh_size,
+                                                  long long buf,
+                                                  long long flags);
+void __sanitizer_syscall_pre_impl_compat_50___fhstat40(long long fhp,
+                                                       long long fh_size,
+                                                       long long sb);
+void __sanitizer_syscall_post_impl_compat_50___fhstat40(long long res,
+                                                        long long fhp,
+                                                        long long fh_size,
+                                                        long long sb);
+void __sanitizer_syscall_pre_impl_aio_cancel(long long fildes,
+                                             long long aiocbp);
+void __sanitizer_syscall_post_impl_aio_cancel(long long res, long long fildes,
+                                              long long aiocbp);
+void __sanitizer_syscall_pre_impl_aio_error(long long aiocbp);
+void __sanitizer_syscall_post_impl_aio_error(long long res, long long aiocbp);
+void __sanitizer_syscall_pre_impl_aio_fsync(long long op, long long aiocbp);
+void __sanitizer_syscall_post_impl_aio_fsync(long long res, long long op,
+                                             long long aiocbp);
+void __sanitizer_syscall_pre_impl_aio_read(long long aiocbp);
+void __sanitizer_syscall_post_impl_aio_read(long long res, long long aiocbp);
+void __sanitizer_syscall_pre_impl_aio_return(long long aiocbp);
+void __sanitizer_syscall_post_impl_aio_return(long long res, long long aiocbp);
+void __sanitizer_syscall_pre_impl_compat_50_aio_suspend(long long list,
+                                                        long long nent,
+                                                        long long timeout);
+void __sanitizer_syscall_post_impl_compat_50_aio_suspend(long long res,
+                                                         long long list,
+                                                         long long nent,
+                                                         long long timeout);
+void __sanitizer_syscall_pre_impl_aio_write(long long aiocbp);
+void __sanitizer_syscall_post_impl_aio_write(long long res, long long aiocbp);
+void __sanitizer_syscall_pre_impl_lio_listio(long long mode, long long list,
+                                             long long nent, long long sig);
+void __sanitizer_syscall_post_impl_lio_listio(long long res, long long mode,
+                                              long long list, long long nent,
+                                              long long sig);
+/* syscall 407 has been skipped */
+/* syscall 408 has been skipped */
+/* syscall 409 has been skipped */
+void __sanitizer_syscall_pre_impl___mount50(long long type, long long path,
+                                            long long flags, long long data,
+                                            long long data_len);
+void __sanitizer_syscall_post_impl___mount50(long long res, long long type,
+                                             long long path, long long flags,
+                                             long long data,
+                                             long long data_len);
+void __sanitizer_syscall_pre_impl_mremap(long long old_address,
+                                         long long old_size,
+                                         long long new_address,
+                                         long long new_size, long long flags);
+void __sanitizer_syscall_post_impl_mremap(long long res, long long old_address,
+                                          long long old_size,
+                                          long long new_address,
+                                          long long new_size, long long flags);
+void __sanitizer_syscall_pre_impl_pset_create(long long psid);
+void __sanitizer_syscall_post_impl_pset_create(long long res, long long psid);
+void __sanitizer_syscall_pre_impl_pset_destroy(long long psid);
+void __sanitizer_syscall_post_impl_pset_destroy(long long res, long long psid);
+void __sanitizer_syscall_pre_impl_pset_assign(long long psid, long long cpuid,
+                                              long long opsid);
+void __sanitizer_syscall_post_impl_pset_assign(long long res, long long psid,
+                                               long long cpuid,
+                                               long long opsid);
+void __sanitizer_syscall_pre_impl__pset_bind(long long idtype,
+                                             long long first_id,
+                                             long long second_id,
+                                             long long psid, long long opsid);
+void __sanitizer_syscall_post_impl__pset_bind(long long res, long long idtype,
+                                              long long first_id,
+                                              long long second_id,
+                                              long long psid, long long opsid);
+void __sanitizer_syscall_pre_impl___posix_fadvise50(long long fd, long long PAD,
+                                                    long long offset,
+                                                    long long len,
+                                                    long long advice);
+void __sanitizer_syscall_post_impl___posix_fadvise50(
+    long long res, long long fd, long long PAD, long long offset, long long len,
+    long long advice);
+void __sanitizer_syscall_pre_impl___select50(long long nd, long long in,
+                                             long long ou, long long ex,
+                                             long long tv);
+void __sanitizer_syscall_post_impl___select50(long long res, long long nd,
+                                              long long in, long long ou,
+                                              long long ex, long long tv);
+void __sanitizer_syscall_pre_impl___gettimeofday50(long long tp, long long tzp);
+void __sanitizer_syscall_post_impl___gettimeofday50(long long res, long long tp,
+                                                    long long tzp);
+void __sanitizer_syscall_pre_impl___settimeofday50(long long tv, long long tzp);
+void __sanitizer_syscall_post_impl___settimeofday50(long long res, long long tv,
+                                                    long long tzp);
+void __sanitizer_syscall_pre_impl___utimes50(long long path, long long tptr);
+void __sanitizer_syscall_post_impl___utimes50(long long res, long long path,
+                                              long long tptr);
+void __sanitizer_syscall_pre_impl___adjtime50(long long delta,
+                                              long long olddelta);
+void __sanitizer_syscall_post_impl___adjtime50(long long res, long long delta,
+                                               long long olddelta);
+void __sanitizer_syscall_pre_impl___lfs_segwait50(long long fsidp,
+                                                  long long tv);
+void __sanitizer_syscall_post_impl___lfs_segwait50(long long res,
+                                                   long long fsidp,
+                                                   long long tv);
+void __sanitizer_syscall_pre_impl___futimes50(long long fd, long long tptr);
+void __sanitizer_syscall_post_impl___futimes50(long long res, long long fd,
+                                               long long tptr);
+void __sanitizer_syscall_pre_impl___lutimes50(long long path, long long tptr);
+void __sanitizer_syscall_post_impl___lutimes50(long long res, long long path,
+                                               long long tptr);
+void __sanitizer_syscall_pre_impl___setitimer50(long long which, long long itv,
+                                                long long oitv);
+void __sanitizer_syscall_post_impl___setitimer50(long long res, long long which,
+                                                 long long itv, long long oitv);
+void __sanitizer_syscall_pre_impl___getitimer50(long long which, long long itv);
+void __sanitizer_syscall_post_impl___getitimer50(long long res, long long which,
+                                                 long long itv);
+void __sanitizer_syscall_pre_impl___clock_gettime50(long long clock_id,
+                                                    long long tp);
+void __sanitizer_syscall_post_impl___clock_gettime50(long long res,
+                                                     long long clock_id,
+                                                     long long tp);
+void __sanitizer_syscall_pre_impl___clock_settime50(long long clock_id,
+                                                    long long tp);
+void __sanitizer_syscall_post_impl___clock_settime50(long long res,
+                                                     long long clock_id,
+                                                     long long tp);
+void __sanitizer_syscall_pre_impl___clock_getres50(long long clock_id,
+                                                   long long tp);
+void __sanitizer_syscall_post_impl___clock_getres50(long long res,
+                                                    long long clock_id,
+                                                    long long tp);
+void __sanitizer_syscall_pre_impl___nanosleep50(long long rqtp, long long rmtp);
+void __sanitizer_syscall_post_impl___nanosleep50(long long res, long long rqtp,
+                                                 long long rmtp);
+void __sanitizer_syscall_pre_impl_____sigtimedwait50(long long set,
+                                                     long long info,
+                                                     long long timeout);
+void __sanitizer_syscall_post_impl_____sigtimedwait50(long long res,
+                                                      long long set,
+                                                      long long info,
+                                                      long long timeout);
+void __sanitizer_syscall_pre_impl___mq_timedsend50(long long mqdes,
+                                                   long long msg_ptr,
+                                                   long long msg_len,
+                                                   long long msg_prio,
+                                                   long long abs_timeout);
+void __sanitizer_syscall_post_impl___mq_timedsend50(
+    long long res, long long mqdes, long long msg_ptr, long long msg_len,
+    long long msg_prio, long long abs_timeout);
+void __sanitizer_syscall_pre_impl___mq_timedreceive50(long long mqdes,
+                                                      long long msg_ptr,
+                                                      long long msg_len,
+                                                      long long msg_prio,
+                                                      long long abs_timeout);
+void __sanitizer_syscall_post_impl___mq_timedreceive50(
+    long long res, long long mqdes, long long msg_ptr, long long msg_len,
+    long long msg_prio, long long abs_timeout);
+void __sanitizer_syscall_pre_impl_compat_60__lwp_park(long long ts,
+                                                      long long unpark,
+                                                      long long hint,
+                                                      long long unparkhint);
+void __sanitizer_syscall_post_impl_compat_60__lwp_park(long long res,
+                                                       long long ts,
+                                                       long long unpark,
+                                                       long long hint,
+                                                       long long unparkhint);
+void __sanitizer_syscall_pre_impl___kevent50(long long fd, long long changelist,
+                                             long long nchanges,
+                                             long long eventlist,
+                                             long long nevents,
+                                             long long timeout);
+void __sanitizer_syscall_post_impl___kevent50(
+    long long res, long long fd, long long changelist, long long nchanges,
+    long long eventlist, long long nevents, long long timeout);
+void __sanitizer_syscall_pre_impl___pselect50(long long nd, long long in,
+                                              long long ou, long long ex,
+                                              long long ts, long long mask);
+void __sanitizer_syscall_post_impl___pselect50(long long res, long long nd,
+                                               long long in, long long ou,
+                                               long long ex, long long ts,
+                                               long long mask);
+void __sanitizer_syscall_pre_impl___pollts50(long long fds, long long nfds,
+                                             long long ts, long long mask);
+void __sanitizer_syscall_post_impl___pollts50(long long res, long long fds,
+                                              long long nfds, long long ts,
+                                              long long mask);
+void __sanitizer_syscall_pre_impl___aio_suspend50(long long list,
+                                                  long long nent,
+                                                  long long timeout);
+void __sanitizer_syscall_post_impl___aio_suspend50(long long res,
+                                                   long long list,
+                                                   long long nent,
+                                                   long long timeout);
+void __sanitizer_syscall_pre_impl___stat50(long long path, long long ub);
+void __sanitizer_syscall_post_impl___stat50(long long res, long long path,
+                                            long long ub);
+void __sanitizer_syscall_pre_impl___fstat50(long long fd, long long sb);
+void __sanitizer_syscall_post_impl___fstat50(long long res, long long fd,
+                                             long long sb);
+void __sanitizer_syscall_pre_impl___lstat50(long long path, long long ub);
+void __sanitizer_syscall_post_impl___lstat50(long long res, long long path,
+                                             long long ub);
+void __sanitizer_syscall_pre_impl_____semctl50(long long semid,
+                                               long long semnum, long long cmd,
+                                               long long arg);
+void __sanitizer_syscall_post_impl_____semctl50(long long res, long long semid,
+                                                long long semnum, long long cmd,
+                                                long long arg);
+void __sanitizer_syscall_pre_impl___shmctl50(long long shmid, long long cmd,
+                                             long long buf);
+void __sanitizer_syscall_post_impl___shmctl50(long long res, long long shmid,
+                                              long long cmd, long long buf);
+void __sanitizer_syscall_pre_impl___msgctl50(long long msqid, long long cmd,
+                                             long long buf);
+void __sanitizer_syscall_post_impl___msgctl50(long long res, long long msqid,
+                                              long long cmd, long long buf);
+void __sanitizer_syscall_pre_impl___getrusage50(long long who,
+                                                long long rusage);
+void __sanitizer_syscall_post_impl___getrusage50(long long res, long long who,
+                                                 long long rusage);
+void __sanitizer_syscall_pre_impl___timer_settime50(long long timerid,
+                                                    long long flags,
+                                                    long long value,
+                                                    long long ovalue);
+void __sanitizer_syscall_post_impl___timer_settime50(long long res,
+                                                     long long timerid,
+                                                     long long flags,
+                                                     long long value,
+                                                     long long ovalue);
+void __sanitizer_syscall_pre_impl___timer_gettime50(long long timerid,
+                                                    long long value);
+void __sanitizer_syscall_post_impl___timer_gettime50(long long res,
+                                                     long long timerid,
+                                                     long long value);
+#if defined(NTP) || !defined(_KERNEL_OPT)
+void __sanitizer_syscall_pre_impl___ntp_gettime50(long long ntvp);
+void __sanitizer_syscall_post_impl___ntp_gettime50(long long res,
+                                                   long long ntvp);
+#else
+/* syscall 448 has been skipped */
+#endif
+void __sanitizer_syscall_pre_impl___wait450(long long pid, long long status,
+                                            long long options,
+                                            long long rusage);
+void __sanitizer_syscall_post_impl___wait450(long long res, long long pid,
+                                             long long status,
+                                             long long options,
+                                             long long rusage);
+void __sanitizer_syscall_pre_impl___mknod50(long long path, long long mode,
+                                            long long dev);
+void __sanitizer_syscall_post_impl___mknod50(long long res, long long path,
+                                             long long mode, long long dev);
+void __sanitizer_syscall_pre_impl___fhstat50(long long fhp, long long fh_size,
+                                             long long sb);
+void __sanitizer_syscall_post_impl___fhstat50(long long res, long long fhp,
+                                              long long fh_size, long long sb);
+/* syscall 452 has been skipped */
+void __sanitizer_syscall_pre_impl_pipe2(long long fildes, long long flags);
+void __sanitizer_syscall_post_impl_pipe2(long long res, long long fildes,
+                                         long long flags);
+void __sanitizer_syscall_pre_impl_dup3(long long from, long long to,
+                                       long long flags);
+void __sanitizer_syscall_post_impl_dup3(long long res, long long from,
+                                        long long to, long long flags);
+void __sanitizer_syscall_pre_impl_kqueue1(long long flags);
+void __sanitizer_syscall_post_impl_kqueue1(long long res, long long flags);
+void __sanitizer_syscall_pre_impl_paccept(long long s, long long name,
+                                          long long anamelen, long long mask,
+                                          long long flags);
+void __sanitizer_syscall_post_impl_paccept(long long res, long long s,
+                                           long long name, long long anamelen,
+                                           long long mask, long long flags);
+void __sanitizer_syscall_pre_impl_linkat(long long fd1, long long name1,
+                                         long long fd2, long long name2,
+                                         long long flags);
+void __sanitizer_syscall_post_impl_linkat(long long res, long long fd1,
+                                          long long name1, long long fd2,
+                                          long long name2, long long flags);
+void __sanitizer_syscall_pre_impl_renameat(long long fromfd, long long from,
+                                           long long tofd, long long to);
+void __sanitizer_syscall_post_impl_renameat(long long res, long long fromfd,
+                                            long long from, long long tofd,
+                                            long long to);
+void __sanitizer_syscall_pre_impl_mkfifoat(long long fd, long long path,
+                                           long long mode);
+void __sanitizer_syscall_post_impl_mkfifoat(long long res, long long fd,
+                                            long long path, long long mode);
+void __sanitizer_syscall_pre_impl_mknodat(long long fd, long long path,
+                                          long long mode, long long PAD,
+                                          long long dev);
+void __sanitizer_syscall_post_impl_mknodat(long long res, long long fd,
+                                           long long path, long long mode,
+                                           long long PAD, long long dev);
+void __sanitizer_syscall_pre_impl_mkdirat(long long fd, long long path,
+                                          long long mode);
+void __sanitizer_syscall_post_impl_mkdirat(long long res, long long fd,
+                                           long long path, long long mode);
+void __sanitizer_syscall_pre_impl_faccessat(long long fd, long long path,
+                                            long long amode, long long flag);
+void __sanitizer_syscall_post_impl_faccessat(long long res, long long fd,
+                                             long long path, long long amode,
+                                             long long flag);
+void __sanitizer_syscall_pre_impl_fchmodat(long long fd, long long path,
+                                           long long mode, long long flag);
+void __sanitizer_syscall_post_impl_fchmodat(long long res, long long fd,
+                                            long long path, long long mode,
+                                            long long flag);
+void __sanitizer_syscall_pre_impl_fchownat(long long fd, long long path,
+                                           long long owner, long long group,
+                                           long long flag);
+void __sanitizer_syscall_post_impl_fchownat(long long res, long long fd,
+                                            long long path, long long owner,
+                                            long long group, long long flag);
+void __sanitizer_syscall_pre_impl_fexecve(long long fd, long long argp,
+                                          long long envp);
+void __sanitizer_syscall_post_impl_fexecve(long long res, long long fd,
+                                           long long argp, long long envp);
+void __sanitizer_syscall_pre_impl_fstatat(long long fd, long long path,
+                                          long long buf, long long flag);
+void __sanitizer_syscall_post_impl_fstatat(long long res, long long fd,
+                                           long long path, long long buf,
+                                           long long flag);
+void __sanitizer_syscall_pre_impl_utimensat(long long fd, long long path,
+                                            long long tptr, long long flag);
+void __sanitizer_syscall_post_impl_utimensat(long long res, long long fd,
+                                             long long path, long long tptr,
+                                             long long flag);
+void __sanitizer_syscall_pre_impl_openat(long long fd, long long path,
+                                         long long oflags, long long mode);
+void __sanitizer_syscall_post_impl_openat(long long res, long long fd,
+                                          long long path, long long oflags,
+                                          long long mode);
+void __sanitizer_syscall_pre_impl_readlinkat(long long fd, long long path,
+                                             long long buf, long long bufsize);
+void __sanitizer_syscall_post_impl_readlinkat(long long res, long long fd,
+                                              long long path, long long buf,
+                                              long long bufsize);
+void __sanitizer_syscall_pre_impl_symlinkat(long long path1, long long fd,
+                                            long long path2);
+void __sanitizer_syscall_post_impl_symlinkat(long long res, long long path1,
+                                             long long fd, long long path2);
+void __sanitizer_syscall_pre_impl_unlinkat(long long fd, long long path,
+                                           long long flag);
+void __sanitizer_syscall_post_impl_unlinkat(long long res, long long fd,
+                                            long long path, long long flag);
+void __sanitizer_syscall_pre_impl_futimens(long long fd, long long tptr);
+void __sanitizer_syscall_post_impl_futimens(long long res, long long fd,
+                                            long long tptr);
+void __sanitizer_syscall_pre_impl___quotactl(long long path, long long args);
+void __sanitizer_syscall_post_impl___quotactl(long long res, long long path,
+                                              long long args);
+void __sanitizer_syscall_pre_impl_posix_spawn(long long pid, long long path,
+                                              long long file_actions,
+                                              long long attrp, long long argv,
+                                              long long envp);
+void __sanitizer_syscall_post_impl_posix_spawn(long long res, long long pid,
+                                               long long path,
+                                               long long file_actions,
+                                               long long attrp, long long argv,
+                                               long long envp);
+void __sanitizer_syscall_pre_impl_recvmmsg(long long s, long long mmsg,
+                                           long long vlen, long long flags,
+                                           long long timeout);
+void __sanitizer_syscall_post_impl_recvmmsg(long long res, long long s,
+                                            long long mmsg, long long vlen,
+                                            long long flags, long long timeout);
+void __sanitizer_syscall_pre_impl_sendmmsg(long long s, long long mmsg,
+                                           long long vlen, long long flags);
+void __sanitizer_syscall_post_impl_sendmmsg(long long res, long long s,
+                                            long long mmsg, long long vlen,
+                                            long long flags);
+void __sanitizer_syscall_pre_impl_clock_nanosleep(long long clock_id,
+                                                  long long flags,
+                                                  long long rqtp,
+                                                  long long rmtp);
+void __sanitizer_syscall_post_impl_clock_nanosleep(long long res,
+                                                   long long clock_id,
+                                                   long long flags,
+                                                   long long rqtp,
+                                                   long long rmtp);
+void __sanitizer_syscall_pre_impl____lwp_park60(long long clock_id,
+                                                long long flags, long long ts,
+                                                long long unpark,
+                                                long long hint,
+                                                long long unparkhint);
+void __sanitizer_syscall_post_impl____lwp_park60(
+    long long res, long long clock_id, long long flags, long long ts,
+    long long unpark, long long hint, long long unparkhint);
+void __sanitizer_syscall_pre_impl_posix_fallocate(long long fd, long long PAD,
+                                                  long long pos, long long len);
+void __sanitizer_syscall_post_impl_posix_fallocate(long long res, long long fd,
+                                                   long long PAD, long long pos,
+                                                   long long len);
+void __sanitizer_syscall_pre_impl_fdiscard(long long fd, long long PAD,
+                                           long long pos, long long len);
+void __sanitizer_syscall_post_impl_fdiscard(long long res, long long fd,
+                                            long long PAD, long long pos,
+                                            long long len);
+void __sanitizer_syscall_pre_impl_wait6(long long idtype, long long id,
+                                        long long status, long long options,
+                                        long long wru, long long info);
+void __sanitizer_syscall_post_impl_wait6(long long res, long long idtype,
+                                         long long id, long long status,
+                                         long long options, long long wru,
+                                         long long info);
+void __sanitizer_syscall_pre_impl_clock_getcpuclockid2(long long idtype,
+                                                       long long id,
+                                                       long long clock_id);
+void __sanitizer_syscall_post_impl_clock_getcpuclockid2(long long res,
+                                                        long long idtype,
+                                                        long long id,
+                                                        long long clock_id);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+// DO NOT EDIT! THIS FILE HAS BEEN GENERATED!
+
+#endif // SANITIZER_NETBSD_SYSCALL_HOOKS_H
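
The declarations above are the raw pre-/post-syscall entry points the
sanitizer runtime uses to model NetBSD syscalls. A minimal, purely
illustrative sketch of driving a pre/post pair by hand around a direct
syscall (the convenience macros defined earlier in this header normally
perform these long long casts):

    #include <sanitizer/netbsd_syscall_hooks.h>
    #include <stdint.h>
    #include <unistd.h>

    static int traced_pipe2(int fildes[2], int flags) {
      int res;
      /* Announce the syscall so the runtime can check the arguments. */
      __sanitizer_syscall_pre_impl_pipe2((long long)(uintptr_t)fildes,
                                         (long long)flags);
      res = pipe2(fildes, flags); /* the actual syscall */
      /* Report the result so fildes[0..1] can be marked initialized. */
      __sanitizer_syscall_post_impl_pipe2((long long)res,
                                          (long long)(uintptr_t)fildes,
                                          (long long)flags);
      return res;
    }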
diff --git a/darwin-x86/clang-headers/sanitizer/scudo_interface.h b/darwin-x86/clang-headers/sanitizer/scudo_interface.h
new file mode 100644
index 0000000..be605f1
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/scudo_interface.h
@@ -0,0 +1,39 @@
+//===-- sanitizer/scudo_interface.h -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// Public Scudo interface header.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_SCUDO_INTERFACE_H_
+#define SANITIZER_SCUDO_INTERFACE_H_
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // This function may be optionally provided by a user and should return
+  // a string containing Scudo runtime options. See scudo_flags.h for details.
+  const char* __scudo_default_options(void);
+
+  // This function allows setting the RSS limit at runtime. This can be
+  // either the hard limit (HardLimit=1) or the soft limit (HardLimit=0).
+  // The limit can be removed by setting LimitMb to 0. The parameters passed
+  // to this function must be fully trusted to avoid security mishaps.
+  void __scudo_set_rss_limit(size_t LimitMb, int HardLimit);
+
+  // This function outputs various allocator statistics for both the Primary
+  // and Secondary allocators, including memory usage, number of allocations
+  // and deallocations.
+  void __scudo_print_stats(void);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_SCUDO_INTERFACE_H_
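
A hedged sketch of how a program might use this interface: the optional
__scudo_default_options is defined by the user and read by the Scudo runtime
at startup, and the other two entry points can be called at any time once the
program is linked with -fsanitize=scudo. The option string below is only an
illustration; see scudo_flags.h for the real set.

    #include <sanitizer/scudo_interface.h>
    #include <stdlib.h>

    /* Picked up by the Scudo runtime at startup. */
    const char *__scudo_default_options(void) {
      return "DeleteSizeMismatch=0:QuarantineSizeKb=64";
    }

    int main(void) {
      void *p = malloc(128);
      free(p);
      __scudo_set_rss_limit(2048, /*HardLimit=*/0); /* 2 GiB soft cap */
      __scudo_print_stats();
      return 0;
    }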
diff --git a/darwin-x86/clang-headers/sanitizer/tsan_interface.h b/darwin-x86/clang-headers/sanitizer/tsan_interface.h
new file mode 100644
index 0000000..5308850
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/tsan_interface.h
@@ -0,0 +1,144 @@
+//===-- tsan_interface.h ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+// Public interface header for TSan.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_TSAN_INTERFACE_H
+#define SANITIZER_TSAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// __tsan_acquire establishes a happens-before relation with a preceding
+// __tsan_release on the same address.
+void __tsan_acquire(void *addr);
+void __tsan_release(void *addr);
+
+// Annotations for custom mutexes.
+// The annotations make it possible to get better reports (with sets of
+// locked mutexes), to detect more types of bugs (e.g. mutex misuse, races
+// between lock/unlock and destruction, and potential deadlocks), and to
+// improve precision and performance (by ignoring individual atomic
+// operations in mutex code). The downside is that annotated mutex code
+// itself is not checked for correctness.
+
+// Mutex creation flags are passed to the __tsan_mutex_create annotation.
+// If a mutex has no constructor and __tsan_mutex_create is not called,
+// the flags may instead be passed to the __tsan_mutex_pre_lock/
+// __tsan_mutex_post_lock annotations.
+
+// The mutex has static storage duration and a no-op constructor and
+// destructor. This effectively makes tsan ignore the destroy annotation.
+const unsigned __tsan_mutex_linker_init      = 1 << 0;
+// Mutex is write reentrant.
+const unsigned __tsan_mutex_write_reentrant  = 1 << 1;
+// Mutex is read reentrant.
+const unsigned __tsan_mutex_read_reentrant   = 1 << 2;
+// Mutex does not have static storage duration, and must not be used after
+// its destructor runs.  The opposite of __tsan_mutex_linker_init.
+// If this flag is passed to __tsan_mutex_destroy, then the destruction
+// is ignored unless this flag was previously set on the mutex.
+const unsigned __tsan_mutex_not_static       = 1 << 8;
+
+// Mutex operation flags:
+
+// Denotes read lock operation.
+const unsigned __tsan_mutex_read_lock        = 1 << 3;
+// Denotes try lock operation.
+const unsigned __tsan_mutex_try_lock         = 1 << 4;
+// Denotes that a try lock operation has failed to acquire the mutex.
+const unsigned __tsan_mutex_try_lock_failed  = 1 << 5;
+// Denotes that the lock operation acquires multiple recursion levels.
+// The number of levels is passed in the recursion parameter.
+// This is useful for annotating e.g. Java builtin monitors, for which the
+// wait operation releases all recursive acquisitions of the mutex.
+const unsigned __tsan_mutex_recursive_lock   = 1 << 6;
+// Denotes that the unlock operation releases all recursion levels.
+// The number of released levels is returned and must later be passed to
+// the corresponding __tsan_mutex_post_lock annotation.
+const unsigned __tsan_mutex_recursive_unlock = 1 << 7;
+
+// Annotate creation of a mutex.
+// Supported flags: mutex creation flags.
+void __tsan_mutex_create(void *addr, unsigned flags);
+
+// Annotate destruction of a mutex.
+// Supported flags:
+//   - __tsan_mutex_linker_init
+//   - __tsan_mutex_not_static
+void __tsan_mutex_destroy(void *addr, unsigned flags);
+
+// Annotate start of lock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock
+//   - __tsan_mutex_try_lock
+//   - all mutex creation flags
+void __tsan_mutex_pre_lock(void *addr, unsigned flags);
+
+// Annotate end of lock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock (must match __tsan_mutex_pre_lock)
+//   - __tsan_mutex_try_lock (must match __tsan_mutex_pre_lock)
+//   - __tsan_mutex_try_lock_failed
+//   - __tsan_mutex_recursive_lock
+//   - all mutex creation flags
+void __tsan_mutex_post_lock(void *addr, unsigned flags, int recursion);
+
+// Annotate start of unlock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock
+//   - __tsan_mutex_recursive_unlock
+int __tsan_mutex_pre_unlock(void *addr, unsigned flags);
+
+// Annotate end of unlock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock (must match __tsan_mutex_pre_unlock)
+void __tsan_mutex_post_unlock(void *addr, unsigned flags);
+
+// Annotate start/end of notify/signal/broadcast operation.
+// Supported flags: none.
+void __tsan_mutex_pre_signal(void *addr, unsigned flags);
+void __tsan_mutex_post_signal(void *addr, unsigned flags);
+
+// Annotate start/end of a region of code where a lock/unlock/signal
+// operation diverts to do something else unrelated to the mutex. This can
+// be used to annotate, for example, calls into a cooperative scheduler or
+// contention profiling code.
+// These annotations must be called only from within
+// __tsan_mutex_pre/post_lock, __tsan_mutex_pre/post_unlock,
+// __tsan_mutex_pre/post_signal regions.
+// Supported flags: none.
+void __tsan_mutex_pre_divert(void *addr, unsigned flags);
+void __tsan_mutex_post_divert(void *addr, unsigned flags);
+
+// External race detection API.
+// Can be used by non-instrumented libraries to detect when their objects are
+// being used in an unsafe manner.
+//   - __tsan_external_read/__tsan_external_write annotate the logical reads
+//       and writes of the object at the specified address. 'caller_pc' should
+//       be the PC of the library user, which the library can obtain with e.g.
+//       `__builtin_return_address(0)`.
+//   - __tsan_external_register_tag registers a 'tag' with the specified name,
+//       which is later used in read/write annotations to denote the object
+//       type.
+//   - __tsan_external_assign_tag can optionally mark a heap object with a tag.
+void *__tsan_external_register_tag(const char *object_type);
+void __tsan_external_register_header(void *tag, const char *header);
+void __tsan_external_assign_tag(void *addr, void *tag);
+void __tsan_external_read(void *addr, void *caller_pc, void *tag);
+void __tsan_external_write(void *addr, void *caller_pc, void *tag);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_TSAN_INTERFACE_H
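
As a concrete illustration of the mutex annotations above, here is a hedged
sketch of a hand-rolled spinlock annotated so that TSan treats it like an
ordinary mutex (build with -fsanitize=thread; the spinlock itself is
illustrative):

    #include <sanitizer/tsan_interface.h>
    #include <stdatomic.h>

    typedef struct { atomic_int held; } spinlock;

    void spin_init(spinlock *m) {
      atomic_init(&m->held, 0);
      __tsan_mutex_create(m, 0); /* plain, non-reentrant mutex */
    }

    void spin_lock(spinlock *m) {
      __tsan_mutex_pre_lock(m, 0);
      int expected = 0;
      while (!atomic_compare_exchange_weak(&m->held, &expected, 1))
        expected = 0;
      __tsan_mutex_post_lock(m, 0, 0); /* acquired, no extra recursion */
    }

    void spin_unlock(spinlock *m) {
      __tsan_mutex_pre_unlock(m, 0);
      atomic_store(&m->held, 0);
      __tsan_mutex_post_unlock(m, 0);
    }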
diff --git a/darwin-x86/clang-headers/sanitizer/tsan_interface_atomic.h b/darwin-x86/clang-headers/sanitizer/tsan_interface_atomic.h
new file mode 100644
index 0000000..4ea77d2
--- /dev/null
+++ b/darwin-x86/clang-headers/sanitizer/tsan_interface_atomic.h
@@ -0,0 +1,222 @@
+//===-- tsan_interface_atomic.h ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+// Public interface header for TSan atomics.
+//===----------------------------------------------------------------------===//
+#ifndef TSAN_INTERFACE_ATOMIC_H
+#define TSAN_INTERFACE_ATOMIC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef char     __tsan_atomic8;
+typedef short    __tsan_atomic16;  // NOLINT
+typedef int      __tsan_atomic32;
+typedef long     __tsan_atomic64;  // NOLINT
+#if defined(__SIZEOF_INT128__) \
+    || (__clang_major__ * 100 + __clang_minor__ >= 302)
+__extension__ typedef __int128 __tsan_atomic128;
+# define __TSAN_HAS_INT128 1
+#else
+# define __TSAN_HAS_INT128 0
+#endif
+
+// Part of ABI, do not change.
+// http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/atomic?view=markup
+typedef enum {
+  __tsan_memory_order_relaxed,
+  __tsan_memory_order_consume,
+  __tsan_memory_order_acquire,
+  __tsan_memory_order_release,
+  __tsan_memory_order_acq_rel,
+  __tsan_memory_order_seq_cst
+} __tsan_memory_order;
+
+__tsan_atomic8 __tsan_atomic8_load(const volatile __tsan_atomic8 *a,
+    __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_load(const volatile __tsan_atomic16 *a,
+    __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_load(const volatile __tsan_atomic32 *a,
+    __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_load(const volatile __tsan_atomic64 *a,
+    __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_load(const volatile __tsan_atomic128 *a,
+    __tsan_memory_order mo);
+#endif
+
+void __tsan_atomic8_store(volatile __tsan_atomic8 *a, __tsan_atomic8 v,
+    __tsan_memory_order mo);
+void __tsan_atomic16_store(volatile __tsan_atomic16 *a, __tsan_atomic16 v,
+    __tsan_memory_order mo);
+void __tsan_atomic32_store(volatile __tsan_atomic32 *a, __tsan_atomic32 v,
+    __tsan_memory_order mo);
+void __tsan_atomic64_store(volatile __tsan_atomic64 *a, __tsan_atomic64 v,
+    __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+void __tsan_atomic128_store(volatile __tsan_atomic128 *a, __tsan_atomic128 v,
+    __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_exchange(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_exchange(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_exchange(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_exchange(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_exchange(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_add(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_add(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_add(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_add(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_add(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_sub(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_sub(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_sub(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_sub(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_sub(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_and(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_and(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_and(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_and(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_and(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_or(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_or(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_or(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_or(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_or(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_xor(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_xor(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_xor(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_xor(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_xor(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_nand(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_nand(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_nand(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_nand(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_nand(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+int __tsan_atomic8_compare_exchange_weak(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 *c, __tsan_atomic8 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic16_compare_exchange_weak(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 *c, __tsan_atomic16 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic32_compare_exchange_weak(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 *c, __tsan_atomic32 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic64_compare_exchange_weak(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 *c, __tsan_atomic64 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#if __TSAN_HAS_INT128
+int __tsan_atomic128_compare_exchange_weak(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 *c, __tsan_atomic128 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#endif
+
+int __tsan_atomic8_compare_exchange_strong(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 *c, __tsan_atomic8 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic16_compare_exchange_strong(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 *c, __tsan_atomic16 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic32_compare_exchange_strong(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 *c, __tsan_atomic32 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic64_compare_exchange_strong(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 *c, __tsan_atomic64 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#if __TSAN_HAS_INT128
+int __tsan_atomic128_compare_exchange_strong(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 *c, __tsan_atomic128 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_compare_exchange_val(
+    volatile __tsan_atomic8 *a, __tsan_atomic8 c, __tsan_atomic8 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+__tsan_atomic16 __tsan_atomic16_compare_exchange_val(
+    volatile __tsan_atomic16 *a, __tsan_atomic16 c, __tsan_atomic16 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+__tsan_atomic32 __tsan_atomic32_compare_exchange_val(
+    volatile __tsan_atomic32 *a, __tsan_atomic32 c, __tsan_atomic32 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+__tsan_atomic64 __tsan_atomic64_compare_exchange_val(
+    volatile __tsan_atomic64 *a, __tsan_atomic64 c, __tsan_atomic64 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_compare_exchange_val(
+    volatile __tsan_atomic128 *a, __tsan_atomic128 c, __tsan_atomic128 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+#endif
+
+void __tsan_atomic_thread_fence(__tsan_memory_order mo);
+void __tsan_atomic_signal_fence(__tsan_memory_order mo);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TSAN_INTERFACE_ATOMIC_H
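
These entry points are what TSan instrumentation emits for C/C++ atomic
operations; a library that implements its own atomics can call them directly
so the race detector models them instead of seeing plain memory accesses. A
minimal sketch (only meaningful when the TSan runtime is linked in):

    #include <sanitizer/tsan_interface_atomic.h>

    static __tsan_atomic32 counter;

    int bump(void) {
      /* Atomic increment, modeled with acquire-release ordering. */
      return __tsan_atomic32_fetch_add(&counter, 1,
                                       __tsan_memory_order_acq_rel);
    }

    int read_counter(void) {
      return __tsan_atomic32_load(&counter, __tsan_memory_order_acquire);
    }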
diff --git a/darwin-x86/clang-headers/sgxintrin.h b/darwin-x86/clang-headers/sgxintrin.h
new file mode 100644
index 0000000..20aee76
--- /dev/null
+++ b/darwin-x86/clang-headers/sgxintrin.h
@@ -0,0 +1,70 @@
+/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <sgxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __SGXINTRIN_H
+#define __SGXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("sgx")))
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+  unsigned int __result;
+  __asm__ ("enclu"
+           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+           : "cc");
+  return __result;
+}
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+  unsigned int __result;
+  __asm__ ("encls"
+           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+           : "cc");
+  return __result;
+}
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+  unsigned int __result;
+  __asm__ ("enclv"
+           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+           : "cc");
+  return __result;
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
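
The three wrappers funnel an SGX leaf function number through EAX and a
three-element register payload through RBX/RCX/RDX. A hedged sketch of an
ENCLU[EGETKEY] call (leaf number and register layout per the Intel SDM; this
only executes inside an enclave, must be built with -msgx, and the pointers
are illustrative):

    #include <x86intrin.h>

    #define ENCLU_EGETKEY 1u /* leaf number from the Intel SDM */

    unsigned int get_enclave_key(void *keyrequest, void *outputdata) {
      __SIZE_TYPE__ d[3];
      d[0] = (__SIZE_TYPE__)keyrequest;    /* RBX: KEYREQUEST address */
      d[1] = (__SIZE_TYPE__)outputdata;    /* RCX: OUTPUTDATA address */
      d[2] = 0;                            /* RDX: unused by EGETKEY  */
      return _enclu_u32(ENCLU_EGETKEY, d); /* EAX: error code, 0 = ok */
    }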
diff --git a/darwin-x86/clang-headers/shaintrin.h b/darwin-x86/clang-headers/shaintrin.h
index 9b5d218..3df4718 100644
--- a/darwin-x86/clang-headers/shaintrin.h
+++ b/darwin-x86/clang-headers/shaintrin.h
@@ -29,10 +29,10 @@
 #define __SHAINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128)))
 
-#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
-  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
+#define _mm_sha1rnds4_epu32(V1, V2, M) \
+  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
diff --git a/darwin-x86/clang-headers/smmintrin.h b/darwin-x86/clang-headers/smmintrin.h
index e48ab03..4806b3e 100644
--- a/darwin-x86/clang-headers/smmintrin.h
+++ b/darwin-x86/clang-headers/smmintrin.h
@@ -21,13 +21,13 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef _SMMINTRIN_H
-#define _SMMINTRIN_H
+#ifndef __SMMINTRIN_H
+#define __SMMINTRIN_H
 
 #include <tmmintrin.h>
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
 
 /* SSE4 Rounding macros. */
 #define _MM_FROUND_TO_NEAREST_INT    0x00
@@ -46,44 +46,401 @@
 #define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
 
+/// Rounds up each element of the 128-bit vector of [4 x float] to an
+///    integer and returns the rounded values in a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_ceil_ps(__m128 X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values to be rounded up.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+
+/// Rounds up each element of the 128-bit vector of [2 x double] to an
+///    integer and returns the rounded values in a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_ceil_pd(__m128d X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double] values to be rounded up.
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+
+/// Copies three upper elements of the first 128-bit vector operand to
+///    the corresponding three upper elements of the 128-bit result vector of
+///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
+///    operand to an integer and copies it to the lowest element of the 128-bit
+///    result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+///    rounded up to the nearest integer and copied to the corresponding bits
+///    of the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+///    values.
 #define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+
+/// Copies the upper element of the first 128-bit vector operand to the
+///    corresponding upper element of the 128-bit result vector of [2 x double].
+///    Rounds up the lower element of the second 128-bit vector operand to an
+///    integer and copies it to the lower element of the 128-bit result vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+///    rounded up to the nearest integer and copied to the corresponding bits
+///    of the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+///    values.
 #define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
 
+/// Rounds down each element of the 128-bit vector of [4 x float] to an
+///    integer and returns the rounded values in a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_floor_ps(__m128 X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values to be rounded down.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+
+/// Rounds down each element of the 128-bit vector of [2 x double] to an
+///    integer and returns the rounded values in a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_floor_pd(__m128d X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+
+/// Copies three upper elements of the first 128-bit vector operand to
+///    the corresponding three upper elements of the 128-bit result vector of
+///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
+///    operand to an integer and copies it to the lowest element of the 128-bit
+///    result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+///    rounded down to the nearest integer and copied to the corresponding bits
+///    of the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+///    values.
 #define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+
+/// Copies the upper element of the first 128-bit vector operand to the
+///    corresponding upper element of the 128-bit result vector of [2 x double].
+///    Rounds down the lower element of the second 128-bit vector operand to an
+///    integer and copies it to the lower element of the 128-bit result vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+///    rounded down to the nearest integer and copied to the corresponding bits
+///    of the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+///    values.
 #define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
 
-#define _mm_round_ps(X, M) __extension__ ({ \
-  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
+/// Rounds each element of the 128-bit vector of [4 x float] to an
+///    integer value according to the rounding control specified by the second
+///    argument and returns the rounded values in a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_round_ps(__m128 X, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float].
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used \n
+///      1: The PE field is not updated \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M \n
+///      1: Use the current MXCSR setting \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest \n
+///      01: Downward (toward negative infinity) \n
+///      10: Upward (toward positive infinity) \n
+///      11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
+#define _mm_round_ps(X, M) \
+  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
 
-#define _mm_round_ss(X, Y, M) __extension__ ({ \
+/// Copies three upper elements of the first 128-bit vector operand to
+///    the corresponding three upper elements of the 128-bit result vector of
+///    [4 x float]. Rounds the lowest element of the second 128-bit vector
+///    operand to an integer value according to the rounding control specified
+///    by the third argument and copies it to the lowest element of the 128-bit
+///    result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+///    rounded to the nearest integer using the specified rounding control and
+///    copied to the corresponding bits of the result.
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used \n
+///      1: The PE field is not updated \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M \n
+///      1: Use the current MXCSR setting \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest \n
+///      01: Downward (toward negative infinity) \n
+///      10: Upward (toward positive infinity) \n
+///      11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+///    values.
+#define _mm_round_ss(X, Y, M) \
   (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
-                                 (__v4sf)(__m128)(Y), (M)); })
+                                 (__v4sf)(__m128)(Y), (M))
 
-#define _mm_round_pd(X, M) __extension__ ({ \
-  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
+/// Rounds each element of the 128-bit vector of [2 x double] to an
+///    integer value according to the rounding control specified by the second
+///    argument and returns the rounded values in a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_round_pd(__m128d X, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double].
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used \n
+///      1: The PE field is not updated \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M \n
+///      1: Use the current MXCSR setting \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest \n
+///      01: Downward (toward negative infinity) \n
+///      10: Upward (toward positive infinity) \n
+///      11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
+#define _mm_round_pd(X, M) \
+  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
 
-#define _mm_round_sd(X, Y, M) __extension__ ({ \
+/// Copies the upper element of the first 128-bit vector operand to the
+///    corresponding upper element of the 128-bit result vector of [2 x double].
+///    Rounds the lower element of the second 128-bit vector operand to an
+///    integer value according to the rounding control specified by the third
+///    argument and copies it to the lower element of the 128-bit result vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+///    rounded to the nearest integer using the specified rounding control and
+///    copied to the corresponding bits of the result.
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used \n
+///      1: The PE field is not updated \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M \n
+///      1: Use the current MXCSR setting \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest \n
+///      01: Downward (toward negative infinity) \n
+///      10: Upward (toward positive infinity) \n
+///      11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+///    values.
+#define _mm_round_sd(X, Y, M) \
   (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
-                                  (__v2df)(__m128d)(Y), (M)); })
+                                  (__v2df)(__m128d)(Y), (M))
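
The rounding macros compose from the mode bits documented above; a short sketch (compile with SSE4.1 enabled, e.g. -msse4.1) of round-to-nearest-even with precision exceptions suppressed:

  #include <smmintrin.h>
  #include <stdio.h>

  int main(void) {
    __m128 v = _mm_set_ps(2.5f, -1.5f, 0.5f, 1.25f);
    /* Bits [1:0] select the mode; bit [3] suppresses precision exceptions. */
    __m128 r = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    float out[4];
    _mm_storeu_ps(out, r);
    /* Ties round to even: 2.5 -> 2, -1.5 -> -2, 0.5 -> 0, 1.25 -> 1. */
    printf("%g %g %g %g\n", out[3], out[2], out[1], out[0]);
    return 0;
  }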
 
 /* SSE4 Packed Blending Intrinsics.  */
-#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
-  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
-                                   (__v2df)(__m128d)(V2), \
-                                   (((M) & 0x01) ? 2 : 0), \
-                                   (((M) & 0x02) ? 3 : 1)); })
+/// Returns a 128-bit vector of [2 x double] where the values are
+///    selected from either the first or second operand as specified by the
+///    third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
+///
+/// \param V1
+///    A 128-bit vector of [2 x double].
+/// \param V2
+///    A 128-bit vector of [2 x double].
+/// \param M
+///    An immediate integer operand, with mask bits [1:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
+///    element in operand \a V1 is copied to the same position in the result.
+///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
+///    is copied to the same position in the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
+#define _mm_blend_pd(V1, V2, M) \
+  (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
+                                    (__v2df)(__m128d)(V2), (int)(M))
 
-#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
-  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
-                                  (((M) & 0x01) ? 4 : 0), \
-                                  (((M) & 0x02) ? 5 : 1), \
-                                  (((M) & 0x04) ? 6 : 2), \
-                                  (((M) & 0x08) ? 7 : 3)); })
+/// Returns a 128-bit vector of [4 x float] where the values are selected
+///    from either the first or second operand as specified by the third
+///    operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
+///
+/// \param V1
+///    A 128-bit vector of [4 x float].
+/// \param V2
+///    A 128-bit vector of [4 x float].
+/// \param M
+///    An immediate integer operand, with mask bits [3:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
+///    element in operand \a V1 is copied to the same position in the result.
+///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
+///    is copied to the same position in the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
+#define _mm_blend_ps(V1, V2, M) \
+  (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
+                                   (__v4sf)(__m128)(V2), (int)(M))
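
A small sketch of the mask-to-lane mapping: M = 0b1010 takes lanes 1 and 3 from V2 and lanes 0 and 2 from V1.

  #include <smmintrin.h>

  static __m128 mix_lanes(__m128 v1, __m128 v2) {
    /* Mask bit i selects lane i: 0 -> v1, 1 -> v2. */
    return _mm_blend_ps(v1, v2, 0x0A);  /* 0b1010 */
  }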
 
+/// Returns a 128-bit vector of [2 x double] where the values are
+///    selected from either the first or second operand as specified by the
+///    third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [2 x double].
+/// \param __V2
+///    A 128-bit vector of [2 x double].
+/// \param __M
+///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    most significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 64-bit element in operand \a __V1 is copied to the same
+///    position in the result. When a mask bit is 1, the corresponding 64-bit
+///    element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
 {
@@ -91,6 +448,26 @@
                                             (__v2df)__M);
 }
 
+/// Returns a 128-bit vector of [4 x float] where the values are
+///    selected from either the first or second operand as specified by the
+///    third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x float].
+/// \param __V2
+///    A 128-bit vector of [4 x float].
+/// \param __M
+///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
+///    how the values are to be copied. The position of the mask bit corresponds
+///    to the most significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 32-bit element in operand \a __V1 is copied to the same
+///    position in the result. When a mask bit is 1, the corresponding 32-bit
+///    element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
 {
@@ -98,6 +475,26 @@
                                            (__v4sf)__M);
 }
 
+/// Returns a 128-bit vector of [16 x i8] where the values are selected
+///    from either of the first or second operand as specified by the third
+///    operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [16 x i8].
+/// \param __V2
+///    A 128-bit vector of [16 x i8].
+/// \param __M
+///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
+///    how the values are to be copied. The position of the mask bit corresponds
+///    to the most significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 8-bit element in operand \a __V1 is copied to the same
+///    position in the result. When a mask bit is 1, the corresponding 8-bit
+///    element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [16 x i8] containing the copied values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
 {
@@ -105,25 +502,68 @@
                                                (__v16qi)__M);
 }
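
Because the variable blends key off each lane's most significant bit, a comparison result (all-ones or all-zeros per lane) makes a natural mask; a sketch:

  #include <smmintrin.h>

  /* Replace negative lanes with zero. */
  static __m128 clamp_negatives(__m128 v) {
    __m128 is_neg = _mm_cmplt_ps(v, _mm_setzero_ps());
    return _mm_blendv_ps(v, _mm_setzero_ps(), is_neg);
  }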
 
-#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
-  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
-                                   (__v8hi)(__m128i)(V2), \
-                                   (((M) & 0x01) ?  8 : 0), \
-                                   (((M) & 0x02) ?  9 : 1), \
-                                   (((M) & 0x04) ? 10 : 2), \
-                                   (((M) & 0x08) ? 11 : 3), \
-                                   (((M) & 0x10) ? 12 : 4), \
-                                   (((M) & 0x20) ? 13 : 5), \
-                                   (((M) & 0x40) ? 14 : 6), \
-                                   (((M) & 0x80) ? 15 : 7)); })
+/// Returns a 128-bit vector of [8 x i16] where the values are selected
+///    from either of the first or second operand as specified by the third
+///    operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
+///
+/// \param V1
+///    A 128-bit vector of [8 x i16].
+/// \param V2
+///    A 128-bit vector of [8 x i16].
+/// \param M
+///    An immediate integer operand, with mask bits [7:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
+///    element in operand \a V1 is copied to the same position in the result.
+///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
+///    is copied to the same position in the result.
+/// \returns A 128-bit vector of [8 x i16] containing the copied values.
+#define _mm_blend_epi16(V1, V2, M) \
+  (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
+                                       (__v8hi)(__m128i)(V2), (int)(M))
 
 /* SSE4 Dword Multiply Instructions.  */
+/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
+///    and returns the lower 32 bits of each product in a 128-bit vector of
+///    [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector.
+/// \param __V2
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the products of both operands.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
 }
 
+/// Multiplies corresponding even-indexed elements of two 128-bit
+///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
+///    containing the products.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32].
+/// \param __V2
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [2 x i64] containing the products of both
+///    operands.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 {
@@ -131,64 +571,243 @@
 }
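
The two multiplies differ in width; a sketch contrasting the truncated and widening forms:

  #include <smmintrin.h>

  static void both_products(__m128i a, __m128i b,
                            __m128i *lo32, __m128i *wide64) {
    *lo32   = _mm_mullo_epi32(a, b);  /* [4 x i32]: low halves of all four */
    *wide64 = _mm_mul_epi32(a, b);    /* [2 x i64]: lanes 0 and 2, widened */
  }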
 
 /* SSE4 Floating Point Dot Product Instructions.  */
-#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+/// Computes the dot product of the two 128-bit vectors of [4 x float]
+///    and returns it in the elements of the 128-bit result vector of
+///    [4 x float].
+///
+///    The immediate integer operand controls which input elements
+///    will contribute to the dot product, and where the final results are
+///    returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float].
+/// \param Y
+///    A 128-bit vector of [4 x float].
+/// \param M
+///    An immediate integer operand. Mask bits [7:4] determine which elements
+///    of the input vectors are used, with bit [4] corresponding to the lowest
+///    element and bit [7] corresponding to the highest element of each [4 x
+///    float] vector. If a bit is set, the corresponding elements from the two
+///    input vectors are used as an input for dot product; otherwise that input
+///    is treated as zero. Bits [3:0] determine which elements of the result
+///    will receive a copy of the final dot product, with bit [0] corresponding
+///    to the lowest element and bit [3] corresponding to the highest element of
+///    each [4 x float] subvector. If a bit is set, the dot product is returned
+///    in the corresponding element; otherwise that element is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the dot product.
+#define _mm_dp_ps(X, Y, M) \
   (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
-                               (__v4sf)(__m128)(Y), (M)); })
+                               (__v4sf)(__m128)(Y), (M))
 
-#define _mm_dp_pd(X, Y, M) __extension__ ({\
+/// Computes the dot product of the two 128-bit vectors of [2 x double]
+///    and returns it in the elements of the 128-bit result vector of
+///    [2 x double].
+///
+///    The immediate integer operand controls which input
+///    elements will contribute to the dot product, and where the final results
+///    are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double].
+/// \param Y
+///    A 128-bit vector of [2 x double].
+/// \param M
+///    An immediate integer operand. Mask bits [5:4] determine which elements
+///    of the input vectors are used, with bit [4] corresponding to the lowest
+///    element and bit [5] corresponding to the highest element of each [2 x
+///    double] vector. If a bit is set, the corresponding elements from the two
+///    input vectors are used as an input for dot product; otherwise that input
+///    is treated as zero. Bits [1:0] determine which elements of the result
+///    will receive a copy of the final dot product, with bit [0] corresponding
+///    to the lowest element and bit [1] corresponding to the highest element of
+///    each [2 x double] vector. If a bit is set, the dot product is returned in
+///    the corresponding element; otherwise that element is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the dot product.
+#define _mm_dp_pd(X, Y, M) \
   (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
-                                (__v2df)(__m128d)(Y), (M)); })
+                                (__v2df)(__m128d)(Y), (M))
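
A sketch of the mask split: the high nibble gates the inputs and the low nibble gates the outputs, so M = 0xFF sums all four lane products and broadcasts the result.

  #include <smmintrin.h>

  static float dot4(__m128 a, __m128 b) {
    __m128 dp = _mm_dp_ps(a, b, 0xFF);  /* use all lanes, write all lanes */
    return _mm_cvtss_f32(dp);           /* any lane now holds the sum */
  }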
 
 /* SSE4 Streaming Load Hint Instruction.  */
+/// Loads integer values from a 128-bit aligned memory location to a
+///    128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
+///
+/// \param __V
+///    A pointer to a 128-bit aligned memory location that contains the integer
+///    values.
+/// \returns A 128-bit integer vector containing the data stored at the
+///    specified memory location.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_stream_load_si128 (__m128i const *__V)
 {
-  return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
+  return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
 }
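
A sketch of its use; the pointer must be 16-byte aligned, and the non-temporal hint tells the cache hierarchy the data will not be reused soon.

  #include <smmintrin.h>

  static __m128i load_once(const void *p) {
    /* p must be 16-byte aligned. */
    return _mm_stream_load_si128((__m128i const *)p);
  }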
 
 /* SSE4 Packed Integer Min/Max Instructions.  */
+/// Compares the corresponding elements of two 128-bit vectors of
+///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
+///    of the two values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [16 x i8].
+/// \param __V2
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi8 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
+/// Compares the corresponding elements of two 128-bit vectors of
+///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
+///    greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [16 x i8].
+/// \param __V2
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi8 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
+/// Compares the corresponding elements of two 128-bit vectors of
+///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
+///    value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [8 x u16].
+/// \param __V2
+///    A 128-bit vector of [8 x u16].
+/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu16 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
+/// Compares the corresponding elements of two 128-bit vectors of
+///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
+///    greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [8 x u16].
+/// \param __V2
+///    A 128-bit vector of [8 x u16].
+/// \returns A 128-bit vector of [8 x u16] containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu16 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
+/// Compares the corresponding elements of two 128-bit vectors of
+///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
+///    value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32].
+/// \param __V2
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
+/// Compares the corresponding elements of two 128-bit vectors of
+///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
+///    greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32].
+/// \param __V2
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
+/// Compares the corresponding elements of two 128-bit vectors of
+///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
+///    value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x u32].
+/// \param __V2
+///    A 128-bit vector of [4 x u32].
+/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
 }
 
+/// Compares the corresponding elements of two 128-bit vectors of
+///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
+///    greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x u32].
+/// \param __V2
+///    A 128-bit vector of [4 x u32].
+/// \returns A 128-bit vector of [4 x u32] containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu32 (__m128i __V1, __m128i __V2)
 {
@@ -196,17 +815,79 @@
 }
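
The packed min/max pairs compose into branch-free clamps; a sketch for unsigned 32-bit lanes:

  #include <smmintrin.h>

  static __m128i clamp_epu32(__m128i v, __m128i lo, __m128i hi) {
    return _mm_min_epu32(_mm_max_epu32(v, lo), hi);
  }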
 
 /* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+/// Takes the first argument \a X and inserts an element from the second
+///    argument \a Y as selected by the third argument \a N. That result then
+///    has elements zeroed out also as selected by the third argument \a N. The
+///    resulting 128-bit vector of [4 x float] is then returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector source operand of [4 x float]. With the exception of
+///    those bits in the result copied from parameter \a Y and zeroed by bits
+///    [3:0] of \a N, all bits from this parameter are copied to the result.
+/// \param Y
+///    A 128-bit vector source operand of [4 x float]. One single-precision
+///    floating-point element from this source, as determined by the immediate
+///    parameter, is copied to the result.
+/// \param N
+///    Specifies which bits from operand \a Y will be copied, which bits in the
+///    result they will be copied to, and which bits in the result will be
+///    cleared. The following assignments are made: \n
+///    Bits [7:6] specify the bits to copy from operand \a Y: \n
+///      00: Selects bits [31:0] from operand \a Y. \n
+///      01: Selects bits [63:32] from operand \a Y. \n
+///      10: Selects bits [95:64] from operand \a Y. \n
+///      11: Selects bits [127:96] from operand \a Y. \n
+///    Bits [5:4] specify the bits in the result to which the selected bits
+///    from operand \a Y are copied: \n
+///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
+///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
+///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
+///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
+///    Bits [3:0]: If any of these bits are set, the corresponding result
+///    element is cleared.
+/// \returns A 128-bit vector of [4 x float] containing the copied
+///    single-precision floating point elements from the operands.
 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+
+/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
+///    returns it, using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_ps(__m128 X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
+/// instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float].
+/// \param N
+///    An immediate value. Bits [1:0] determine which bits from the argument
+///    \a X are extracted and returned: \n
+///    00: Bits [31:0] of parameter \a X are returned. \n
+///    01: Bits [63:32] of parameter \a X are returned. \n
+///    10: Bits [95:64] of parameter \a X are returned. \n
+///    11: Bits [127:96] of parameter \a X are returned.
+/// \returns A 32-bit integer containing the extracted 32 bits of float data.
 #define _mm_extract_ps(X, N) (__extension__                      \
-                              ({ union { int __i; float __f; } __t;  \
-                                 __v4sf __a = (__v4sf)(__m128)(X);       \
-                                 __t.__f = __a[(N) & 3];                 \
-                                 __t.__i;}))
+  ({ union { int __i; float __f; } __t;  \
+     __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
+     __t.__i;}))
 
 /* Miscellaneous insert and extract macros.  */
 /* Extract a single-precision float from X at index N into D.  */
-#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
-                                                    (D) = __a[N]; }))
+#define _MM_EXTRACT_FLOAT(D, X, N) \
+  { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); }
 
 /* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
    an index suitable for _mm_insert_ps.  */
@@ -217,60 +898,328 @@
                                              _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
 
 /* Insert int into packed integer array at index.  */
-#define _mm_insert_epi8(X, I, N) (__extension__                           \
-                                  ({ __v16qi __a = (__v16qi)(__m128i)(X); \
-                                     __a[(N) & 15] = (I);                 \
-                                     (__m128i)__a;}))
-#define _mm_insert_epi32(X, I, N) (__extension__                         \
-                                   ({ __v4si __a = (__v4si)(__m128i)(X); \
-                                      __a[(N) & 3] = (I);                \
-                                      (__m128i)__a;}))
+/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
+///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
+///    of an integer parameter \a I into an offset specified by the immediate
+///    value parameter \a N.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
+///    result and then one of the sixteen elements in the result vector is
+///    replaced by the lower 8 bits of \a I.
+/// \param I
+///    An integer. The lower 8 bits of this operand are written to the result
+///    beginning at the offset specified by \a N.
+/// \param N
+///    An immediate value. Bits [3:0] specify the bit offset in the result at
+///    which the lower 8 bits of \a I are written. \n
+///    0000: Bits [7:0] of the result are used for insertion. \n
+///    0001: Bits [15:8] of the result are used for insertion. \n
+///    0010: Bits [23:16] of the result are used for insertion. \n
+///    0011: Bits [31:24] of the result are used for insertion. \n
+///    0100: Bits [39:32] of the result are used for insertion. \n
+///    0101: Bits [47:40] of the result are used for insertion. \n
+///    0110: Bits [55:48] of the result are used for insertion. \n
+///    0111: Bits [63:56] of the result are used for insertion. \n
+///    1000: Bits [71:64] of the result are used for insertion. \n
+///    1001: Bits [79:72] of the result are used for insertion. \n
+///    1010: Bits [87:80] of the result are used for insertion. \n
+///    1011: Bits [95:88] of the result are used for insertion. \n
+///    1100: Bits [103:96] of the result are used for insertion. \n
+///    1101: Bits [111:104] of the result are used for insertion. \n
+///    1110: Bits [119:112] of the result are used for insertion. \n
+///    1111: Bits [127:120] of the result are used for insertion.
+/// \returns A 128-bit integer vector containing the constructed values.
+#define _mm_insert_epi8(X, I, N) \
+  (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
+                                        (int)(I), (int)(N))
+
+/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
+///    the 128-bit integer vector parameter, and then inserting the 32-bit
+///    integer parameter \a I at the offset specified by the immediate value
+///    parameter \a N.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
+///    result and then one of the four elements in the result vector is
+///    replaced by \a I.
+/// \param I
+///    A 32-bit integer that is written to the result beginning at the offset
+///    specified by \a N.
+/// \param N
+///    An immediate value. Bits [1:0] specify the bit offset in the result at
+///    which the integer \a I is written. \n
+///    00: Bits [31:0] of the result are used for insertion. \n
+///    01: Bits [63:32] of the result are used for insertion. \n
+///    10: Bits [95:64] of the result are used for insertion. \n
+///    11: Bits [127:96] of the result are used for insertion.
+/// \returns A 128-bit integer vector containing the constructed values.
+#define _mm_insert_epi32(X, I, N) \
+  (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
+                                       (int)(I), (int)(N))
+
 #ifdef __x86_64__
-#define _mm_insert_epi64(X, I, N) (__extension__                         \
-                                   ({ __v2di __a = (__v2di)(__m128i)(X); \
-                                      __a[(N) & 1] = (I);                \
-                                      (__m128i)__a;}))
+/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
+///    the 128-bit integer vector parameter, and then inserting the 64-bit
+///    integer parameter \a I, using the immediate value parameter \a N as an
+///    insertion location selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
+///    result and then one of the two elements in the result vector is replaced
+///    by \a I.
+/// \param I
+///    A 64-bit integer that is written to the result beginning at the offset
+///    specified by \a N.
+/// \param N
+///    An immediate value. Bit [0] specifies the bit offset in the result at
+///    which the integer \a I is written. \n
+///    0: Bits [63:0] of the result are used for insertion. \n
+///    1: Bits [127:64] of the result are used for insertion. \n
+/// \returns A 128-bit integer vector containing the constructed values.
+#define _mm_insert_epi64(X, I, N) \
+  (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
+                                       (long long)(I), (int)(N))
 #endif /* __x86_64__ */
 
 /* Extract int from packed integer array at index.  This returns the element
  * as a zero extended value, so it is unsigned.
  */
-#define _mm_extract_epi8(X, N) (__extension__                           \
-                                ({ __v16qi __a = (__v16qi)(__m128i)(X); \
-                                   (int)(unsigned char) __a[(N) & 15];}))
-#define _mm_extract_epi32(X, N) (__extension__                         \
-                                 ({ __v4si __a = (__v4si)(__m128i)(X); \
-                                    (int)__a[(N) & 3];}))
+/// Extracts an 8-bit element from the 128-bit integer vector of
+///    [16 x i8], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_epi8(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector.
+/// \param N
+///    An immediate value. Bits [3:0] specify which 8-bit vector element from
+///    the argument \a X to extract and copy to the result. \n
+///    0000: Bits [7:0] of parameter \a X are extracted. \n
+///    0001: Bits [15:8] of the parameter \a X are extracted. \n
+///    0010: Bits [23:16] of the parameter \a X are extracted. \n
+///    0011: Bits [31:24] of the parameter \a X are extracted. \n
+///    0100: Bits [39:32] of the parameter \a X are extracted. \n
+///    0101: Bits [47:40] of the parameter \a X are extracted. \n
+///    0110: Bits [55:48] of the parameter \a X are extracted. \n
+///    0111: Bits [63:56] of the parameter \a X are extracted. \n
+///    1000: Bits [71:64] of the parameter \a X are extracted. \n
+///    1001: Bits [79:72] of the parameter \a X are extracted. \n
+///    1010: Bits [87:80] of the parameter \a X are extracted. \n
+///    1011: Bits [95:88] of the parameter \a X are extracted. \n
+///    1100: Bits [103:96] of the parameter \a X are extracted. \n
+///    1101: Bits [111:104] of the parameter \a X are extracted. \n
+///    1110: Bits [119:112] of the parameter \a X are extracted. \n
+///    1111: Bits [127:120] of the parameter \a X are extracted.
+/// \returns  An unsigned integer, whose lower 8 bits are selected from the
+///    128-bit integer vector parameter and the remaining bits are assigned
+///    zeros.
+#define _mm_extract_epi8(X, N) \
+  (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
+                                                   (int)(N))
+
+/// Extracts a 32-bit element from the 128-bit integer vector of
+///    [4 x i32], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_epi32(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector.
+/// \param N
+///    An immediate value. Bits [1:0] specify which 32-bit vector element from
+///    the argument \a X to extract and copy to the result. \n
+///    00: Bits [31:0] of the parameter \a X are extracted. \n
+///    01: Bits [63:32] of the parameter \a X are extracted. \n
+///    10: Bits [95:64] of the parameter \a X are extracted. \n
+///    11: Bits [127:96] of the parameter \a X are extracted.
+/// \returns  An integer, whose lower 32 bits are selected from the 128-bit
+///    integer vector parameter and the remaining bits are assigned zeros.
+#define _mm_extract_epi32(X, N) \
+  (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
+
 #ifdef __x86_64__
-#define _mm_extract_epi64(X, N) (__extension__                         \
-                                 ({ __v2di __a = (__v2di)(__m128i)(X); \
-                                    (long long)__a[(N) & 1];}))
+/// Extracts a 64-bit element from the 128-bit integer vector of
+///    [2 x i64], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// long long _mm_extract_epi64(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector.
+/// \param N
+///    An immediate value. Bit [0] specifies which 64-bit vector element from
+///    the argument \a X to return. \n
+///    0: Bits [63:0] are returned. \n
+///    1: Bits [127:64] are returned. \n
+/// \returns  A 64-bit integer.
+#define _mm_extract_epi64(X, N) \
+  (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
 #endif /* __x86_64 */
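
A sketch of the insert/extract round trip; N is a lane index, and byte extraction zero-extends, so values read back in the range 0..255:

  #include <smmintrin.h>

  static int roundtrip_byte(__m128i v) {
    __m128i w = _mm_insert_epi8(v, 0xAB, 5);  /* overwrite lane 5 */
    return _mm_extract_epi8(w, 5);            /* reads back 0xAB (171) */
  }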
 
 /* SSE4 128-bit Packed Integer Comparisons.  */
+/// Tests whether the specified bits in a 128-bit integer vector are all
+///    zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param __M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+///    A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testz_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
 }
 
+/// Tests whether the specified bits in a 128-bit integer vector are all
+///    ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param __M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+///    A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testc_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
 }
 
+/// Tests whether the specified bits in a 128-bit integer vector are
+///    neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param __M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+///    A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones;
+///    FALSE otherwise.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testnzc_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
 }
 
+/// Tests whether the specified bits in a 128-bit integer vector are all
+///    ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_all_ones(__m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param V
+///    A 128-bit integer vector containing the bits to be tested.
+/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
+///    otherwise.
 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+
+/// Tests whether the specified bits in a 128-bit integer vector are
+///    neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param V
+///    A 128-bit integer vector selecting which bits to test in operand \a M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones;
+///    FALSE otherwise.
 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+
+/// Tests whether the specified bits in a 128-bit integer vector are all
+///    zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_all_zeros(__m128i M, __m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param V
+///    A 128-bit integer vector selecting which bits to test in operand \a M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
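
A sketch of the PTEST predicates: _mm_test_all_zeros(M, V) is nonzero exactly when the bitwise AND of the two operands is zero.

  #include <smmintrin.h>

  static int low_half_is_zero(__m128i v) {
    __m128i low_mask = _mm_set_epi64x(0, -1LL);  /* select bits [63:0] */
    return _mm_test_all_zeros(v, low_mask);
  }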
 
 /* SSE4 64-bit Packed Integer Comparisons.  */
+/// Compares each of the corresponding 64-bit values of the 128-bit
+///    integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector.
+/// \param __V2
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
 {
@@ -278,6 +1227,19 @@
 }
 
 /* SSE4 Packed Integer Sign-Extension.  */
+/// Sign-extends each of the lower eight 8-bit integer elements of a
+///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
+///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
+///    extended to 16-bit values.
+/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi16(__m128i __V)
 {
@@ -286,6 +1248,19 @@
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
+/// Sign-extends each of the lower four 8-bit integer elements of a
+///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
+///    128-bit vector of [4 x i32]. The upper twelve elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
+///    sign-extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi32(__m128i __V)
 {
@@ -294,6 +1269,19 @@
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
 }
 
+/// Sign-extends each of the lower two 8-bit integer elements of a
+///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
+///    sign-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi64(__m128i __V)
 {
@@ -302,18 +1290,57 @@
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
 }
 
+/// Sign-extends each of the lower four 16-bit integer elements of a
+///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
+///    a 128-bit vector of [4 x i32]. The upper four elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
+///    sign-extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi32(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
 }
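
A minimal usage sketch of the sign-extension forms, assuming an SSE4.1-capable
target compiled with -msse4.1 (illustrative only, not part of the header):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
    /* Lower four 16-bit lanes: -1, 2, -3, 4; the upper four are ignored. */
    __m128i v = _mm_setr_epi16(-1, 2, -3, 4, 100, 200, 300, 400);
    __m128i w = _mm_cvtepi16_epi32(v);            /* VPMOVSXWD */
    int out[4];
    _mm_storeu_si128((__m128i *)out, w);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* -1 2 -3 4 */
    return 0;
}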
 
+/// Sign-extends each of the lower two 16-bit integer elements of a
+///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper six elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
+///    sign-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi64(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
 }
 
+/// Sign-extends each of the lower two 32-bit integer elements of a
+///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
+///    sign-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi32_epi64(__m128i __V)
 {
@@ -321,36 +1348,114 @@
 }
 
 /* SSE4 Packed Integer Zero-Extension.  */
+/// Zero-extends each of the lower eight 8-bit integer elements of a
+///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
+///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
+///    zero-extended to 16-bit values.
+/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi16(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
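
By contrast with the sign-extending forms, the zero-extending forms treat the
input lanes as unsigned, so 0xFF widens to 255 rather than -1. A small sketch
under the same SSE4.1 assumptions (illustrative only):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i v = _mm_setr_epi8((char)0xFF, 1, 2, 3, 4, 5, 6, 7,
                              8, 9, 10, 11, 12, 13, 14, 15);
    __m128i w = _mm_cvtepu8_epi16(v);             /* VPMOVZXBW */
    short out[8];
    _mm_storeu_si128((__m128i *)out, w);
    printf("%d\n", out[0]);                       /* 255, not -1 */
    return 0;
}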
 
+/// Zero-extends each of the lower four 8-bit integer elements of a
+///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
+///    128-bit vector of [4 x i32]. The upper twelve elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
+///    zero-extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi32(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
 }
 
+/// Zero-extends each of the lower two 8-bit integer elements of a
+///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
+///    zero-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi64(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
 }
 
+/// Zero-extends each of the lower four 16-bit integer elements of a
+///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
+///    a 128-bit vector of [4 x i32]. The upper four elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
+///    zero-extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi32(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
 }
 
+/// Zero-extends each of the lower two 16-bit integer elements of a
+///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
+///    zero-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi64(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
 }
 
+/// Zero-extends each of the lower two 32-bit integer elements of a
+///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
+///    zero-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu32_epi64(__m128i __V)
 {
@@ -358,6 +1463,28 @@
 }
 
 /* SSE4 Pack with Unsigned Saturation.  */
+/// Converts 32-bit signed integers from both 128-bit integer vector
+///    operands into 16-bit unsigned integers, and returns the packed result.
+///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
+///    0x0000 are saturated to 0x0000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
+///    signed integer and is converted to a 16-bit unsigned integer with
+///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
+///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
+///    are written to the lower 64 bits of the result.
+/// \param __V2
+///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
+///    signed integer and is converted to a 16-bit unsigned integer with
+///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
+///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
+///    are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packus_epi32(__m128i __V1, __m128i __V2)
 {
@@ -365,10 +1492,58 @@
 }
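
The saturation behaviour described above is easiest to see with out-of-range
inputs; a brief sketch (illustrative only, same SSE4.1 assumptions):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_setr_epi32(-5, 0, 70000, 65535);  /* -5 and 70000 saturate */
    __m128i b = _mm_setr_epi32(1, 2, 3, 4);
    __m128i p = _mm_packus_epi32(a, b);               /* PACKUSDW */
    unsigned short out[8];
    _mm_storeu_si128((__m128i *)out, p);
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);  /* 0 0 65535 65535 */
    return 0;
}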
 
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
-#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+/// Subtracts 8-bit unsigned integer values in the source operands, computes
+///    the absolute values of the differences, and returns sums of those
+///    absolute differences according to the bit fields in the immediate
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [16 x i8].
+/// \param Y
+///    A 128-bit vector of [16 x i8].
+/// \param M
+///    An 8-bit immediate operand specifying how the absolute differences are to
+///    be calculated, according to the following algorithm:
+///    \code
+///    // M2 represents bit 2 of the immediate operand
+///    // M10 represents bits [1:0] of the immediate operand
+///    i = M2 * 4;
+///    j = M10 * 4;
+///    for (k = 0; k < 8; k = k + 1) {
+///      d0 = abs(X[i + k + 0] - Y[j + 0]);
+///      d1 = abs(X[i + k + 1] - Y[j + 1]);
+///      d2 = abs(X[i + k + 2] - Y[j + 2]);
+///      d3 = abs(X[i + k + 3] - Y[j + 3]);
+///      r[k] = d0 + d1 + d2 + d3;
+///    }
+///    \endcode
+/// \returns A 128-bit integer vector containing the sums of the sets of
+///    absolute differences between both operands.
+#define _mm_mpsadbw_epu8(X, Y, M) \
   (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
-                                      (__v16qi)(__m128i)(Y), (M)); })
+                                      (__v16qi)(__m128i)(Y), (M))
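
With M = 0 the block offsets i and j are both zero, so r[0] is the sum of
absolute differences between the first four bytes of each operand; a small
sketch (illustrative only):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i x = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
                              9, 10, 11, 12, 13, 14, 15, 16);
    __m128i y = _mm_setr_epi8(4, 3, 2, 1, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0);
    __m128i r = _mm_mpsadbw_epu8(x, y, 0);
    unsigned short out[8];
    _mm_storeu_si128((__m128i *)out, r);
    printf("r[0] = %u\n", out[0]);  /* |1-4|+|2-3|+|3-2|+|4-1| = 8 */
    return 0;
}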
 
+/// Finds the minimum unsigned 16-bit element in the input 128-bit
+///    vector of [8 x u16] and returns its value along with its index.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
+/// instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x u16].
+/// \returns A 128-bit value where bits [15:0] contain the minimum value found
+///    in parameter \a __V, bits [18:16] contain the index of the minimum value
+///    and the remaining bits are set to 0.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_minpos_epu16(__m128i __V)
 {
@@ -410,61 +1585,769 @@
 #define _SIDD_UNIT_MASK                 0x40
 
 /* SSE4.2 Packed Comparison Intrinsics.  */
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns a 128-bit integer vector representing the result
+///    mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words, the type of comparison to perform, and the format of the return
+///    value. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
+///             bytes. \n
+///      0: The result is zero-extended to 16 bytes. \n
+///      1: The result is expanded to 16 bytes (this expansion is performed by
+///         repeating each bit 8 or 16 times).
+/// \returns Returns a 128-bit integer vector representing the result mask of
+///    the comparison.
 #define _mm_cmpistrm(A, B, M) \
   (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M))
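
For example, with _SIDD_CMP_EQUAL_ANY and _SIDD_BIT_MASK, bit i of the result
is set when byte i of operand B matches any byte of operand A. A sketch
assuming an SSE4.2 target built with -msse4.2 (illustrative only):

#include <nmmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    __m128i set, text;
    memcpy(&set, "aeiou\0\0\0\0\0\0\0\0\0\0\0", 16);
    memcpy(&text, "hello, world....", 16);
    __m128i mask = _mm_cmpistrm(set, text,
                                _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                                _SIDD_BIT_MASK);
    /* 0x0112: bits 1, 4 and 8 ('e', 'o', 'o') are set. */
    printf("mask = 0x%04x\n", (unsigned)_mm_cvtsi128_si32(mask));
    return 0;
}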
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns an integer representing the result index of the
+///    comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words, the type of comparison to perform, and the format of the return
+///    value. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+///    Bit [6]: Determines whether the index of the lowest set bit or the
+///             highest set bit is returned. \n
+///      0: The index of the least significant set bit. \n
+///      1: The index of the most significant set bit. \n
+/// \returns Returns an integer representing the result index of the comparison.
 #define _mm_cmpistri(A, B, M) \
   (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M))
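
A sketch of the index form, returning the position of the first byte of B
that matches any byte of A (16 when there is no match); same SSE4.2
assumptions as above (illustrative only):

#include <nmmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    __m128i digits, text;
    memcpy(&digits, "0123456789\0\0\0\0\0\0", 16);
    memcpy(&text, "abc42def........", 16);
    int idx = _mm_cmpistri(digits, text,
                           _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                           _SIDD_LEAST_SIGNIFICANT);
    printf("first digit at index %d\n", idx);  /* 3, the '4' */
    return 0;
}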
 
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns a 128-bit integer vector representing the result
+///    mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words, the type of comparison to perform, and the format of the return
+///    value. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
+///             bytes. \n
+///      0: The result is zero-extended to 16 bytes. \n
+///      1: The result is expanded to 16 bytes (this expansion is performed by
+///         repeating each bit 8 or 16 times). \n
+/// \returns Returns a 128-bit integer vector representing the result mask of
+///    the comparison.
 #define _mm_cmpestrm(A, LA, B, LB, M) \
   (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns an integer representing the result index of the
+///    comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words, the type of comparison to perform, and the format of the return
+///    value. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+///    Bit [6]: Determines whether the index of the lowest set bit or the
+///             highest set bit is returned. \n
+///      0: The index of the least significant set bit. \n
+///      1: The index of the most significant set bit. \n
+/// \returns Returns an integer representing the result index of the comparison.
 #define _mm_cmpestri(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M))
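
Because the lengths are explicit, the search can look past embedded NUL
bytes, which the implicit-length forms would treat as end of string. A sketch
using _SIDD_CMP_EQUAL_ORDERED for substring search (illustrative only):

#include <nmmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    __m128i needle, hay;
    memcpy(&needle, "\0X\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16);
    memcpy(&hay, "ab\0Xcd\0\0\0\0\0\0\0\0\0\0", 16);
    int idx = _mm_cmpestri(needle, 2, hay, 6,
                           _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED |
                           _SIDD_LEAST_SIGNIFICANT);
    printf("substring found at index %d\n", idx);  /* 2 */
    return 0;
}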
 
 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
+///    string in \a B is the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns 1 if the bit mask is zero and the length of the string in
+///    \a B is the maximum; otherwise, returns 0.
 #define _mm_cmpistra(A, B, M) \
   (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the bit mask is non-zero; otherwise, returns
+///    0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns 1 if the bit mask is non-zero; otherwise, returns 0.
 #define _mm_cmpistrc(A, B, M) \
   (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns bit 0 of the resulting bit mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns bit 0 of the resulting bit mask.
 #define _mm_cmpistro(A, B, M) \
   (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
+///    the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns 1 if the length of the string in \a A is less than the
+///    maximum; otherwise, returns 0.
 #define _mm_cmpistrs(A, B, M) \
   (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
+///    the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns 1 if the length of the string in \a B is less than the
+///    maximum; otherwise, returns 0.
 #define _mm_cmpistrz(A, B, M) \
   (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
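
The flag-reading forms above combine naturally into scanning loops: the CFlag
form reports a match in the current chunk, and the ZFlag form reports that
the chunk contained the terminating NUL. A sketch with a hypothetical helper
(contains_any is not a real API, and the buffer is assumed readable in
16-byte chunks):

#include <nmmintrin.h>
#include <stdio.h>
#include <string.h>

/* Nonzero when the NUL-terminated string s contains any byte from set. */
static int contains_any(const char *s, __m128i set) {
    for (;;) {
        __m128i chunk;
        memcpy(&chunk, s, 16);
        if (_mm_cmpistrc(set, chunk, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY))
            return 1;  /* CFlag: some byte of this chunk matched */
        if (_mm_cmpistrz(set, chunk, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY))
            return 0;  /* ZFlag: this chunk held the end of the string */
        s += 16;
    }
}

int main(void) {
    char buf[32] = "no vowels here?!";
    __m128i set;
    memcpy(&set, "aeiou\0\0\0\0\0\0\0\0\0\0\0", 16);
    printf("%d\n", contains_any(buf, set));  /* 1: 'o' and 'e' match */
    return 0;
}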
 
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
+///    string in \a B is the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns 1 if the bit mask is zero and the length of the string in
+///    \a B is the maximum; otherwise, returns 0.
 #define _mm_cmpestra(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the resulting mask is non-zero; otherwise,
+///    returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns 1 if the resulting mask is non-zero; otherwise, returns 0.
 #define _mm_cmpestrc(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns bit 0 of the resulting bit mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns bit 0 of the resulting bit mask.
 #define _mm_cmpestro(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
+///    the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns 1 if the length of the string in \a A is less than the
+///    maximum; otherwise, returns 0.
 #define _mm_cmpestrs(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
+///    the maximum; otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns 1 if the length of the string in \a B is less than the
+///    maximum; otherwise, returns 0.
 #define _mm_cmpestrz(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
 
 /* SSE4.2 Compare Packed Data -- Greater Than.  */
+/// Compares each of the corresponding 64-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are
+///    greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector.
+/// \param __V2
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
 {
@@ -472,18 +2355,60 @@
 }
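
A short sketch of the 64-bit greater-than comparison, assuming an SSE4.2
target built with -msse4.2 (illustrative only):

#include <nmmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set_epi64x(10, -1);
    __m128i b = _mm_set_epi64x(5, 0);
    /* Each 64-bit lane is all-ones where a > b, all-zeros elsewhere. */
    __m128i gt = _mm_cmpgt_epi64(a, b);     /* PCMPGTQ */
    long long out[2];
    _mm_storeu_si128((__m128i *)out, gt);
    printf("%lld %lld\n", out[0], out[1]);  /* 0 -1 */
    return 0;
}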
 
 /* SSE4.2 Accumulate CRC32.  */
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
+///    unsigned char operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum of operand
+///    \a __D.
+/// \param __D
+///    An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+///    operand \a __D.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u8(unsigned int __C, unsigned char __D)
 {
   return __builtin_ia32_crc32qi(__C, __D);
 }
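
Chaining the byte form over a buffer yields a CRC-32C checksum; the wider
_mm_crc32_u32/_mm_crc32_u64 steps can process aligned chunks faster. A sketch
using the conventional initial value and final inversion (illustrative only):

#include <nmmintrin.h>
#include <stdio.h>

static unsigned crc32c(const unsigned char *p, unsigned n) {
    unsigned crc = 0xFFFFFFFFu;    /* conventional initial value */
    for (unsigned i = 0; i < n; i++)
        crc = _mm_crc32_u8(crc, p[i]);
    return crc ^ 0xFFFFFFFFu;      /* conventional final xor */
}

int main(void) {
    /* Prints e3069283, the well-known CRC-32C check value. */
    printf("%08x\n", crc32c((const unsigned char *)"123456789", 9));
    return 0;
}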
 
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
+///    unsigned short operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum of operand
+///    \a __D.
+/// \param __D
+///    An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+///    operand \a __D.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u16(unsigned int __C, unsigned short __D)
 {
   return __builtin_ia32_crc32hi(__C, __D);
 }
 
+/// Adds the first unsigned integer operand to the CRC-32C checksum of
+///    the second unsigned integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum of operand
+///    \a __D.
+/// \param __D
+///    An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+///    operand \a __D.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u32(unsigned int __C, unsigned int __D)
 {
@@ -491,6 +2416,20 @@
 }
 
 #ifdef __x86_64__
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
+///    unsigned 64-bit integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum of operand
+///    \a __D.
+/// \param __D
+///    An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+///    operand \a __D.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
 {
@@ -500,8 +2439,6 @@
 
 #undef __DEFAULT_FN_ATTRS
 
-#ifdef __POPCNT__
 #include <popcntintrin.h>
-#endif
 
-#endif /* _SMMINTRIN_H */
+#endif /* __SMMINTRIN_H */
diff --git a/darwin-x86/clang-headers/stdarg.h b/darwin-x86/clang-headers/stdarg.h
index a57e183..101426f 100644
--- a/darwin-x86/clang-headers/stdarg.h
+++ b/darwin-x86/clang-headers/stdarg.h
@@ -43,10 +43,9 @@
 #define va_copy(dest, src)  __builtin_va_copy(dest, src)
 #endif
 
-/* Hack required to make standard headers work, at least on Ubuntu */
 #ifndef __GNUC_VA_LIST
 #define __GNUC_VA_LIST 1
-#endif
 typedef __builtin_va_list __gnuc_va_list;
+#endif
 
 #endif /* __STDARG_H */
diff --git a/darwin-x86/clang-headers/stdatomic.h b/darwin-x86/clang-headers/stdatomic.h
index e037987..3c61334 100644
--- a/darwin-x86/clang-headers/stdatomic.h
+++ b/darwin-x86/clang-headers/stdatomic.h
@@ -1,190 +1,387 @@
-/*===---- stdatomic.h - Standard header for atomic types and operations -----===
+/*-
+ * Copyright (c) 2011 Ed Schouten <ed@FreeBSD.org>
+ *                    David Chisnall <theraven@FreeBSD.org>
+ * All rights reserved.
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
  *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
+ * $FreeBSD$
  */
 
-#ifndef __CLANG_STDATOMIC_H
-#define __CLANG_STDATOMIC_H
+#ifndef _STDATOMIC_H_
+#define	_STDATOMIC_H_
 
-/* If we're hosted, fall back to the system's stdatomic.h. FreeBSD, for
- * example, already has a Clang-compatible stdatomic.h header.
- */
-#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>)
-# include_next <stdatomic.h>
-#else
+#include <sys/cdefs.h>
 
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
+#if defined(__cplusplus) && __cplusplus >= 201103L && defined(_USING_LIBCXX)
+# if __has_feature(cxx_atomic)
+#  define _STDATOMIC_HAVE_ATOMIC
+# endif
 #endif
 
-/* 7.17.1 Introduction */
+#ifdef _STDATOMIC_HAVE_ATOMIC
 
-#define ATOMIC_BOOL_LOCK_FREE       __GCC_ATOMIC_BOOL_LOCK_FREE
-#define ATOMIC_CHAR_LOCK_FREE       __GCC_ATOMIC_CHAR_LOCK_FREE
-#define ATOMIC_CHAR16_T_LOCK_FREE   __GCC_ATOMIC_CHAR16_T_LOCK_FREE
-#define ATOMIC_CHAR32_T_LOCK_FREE   __GCC_ATOMIC_CHAR32_T_LOCK_FREE
-#define ATOMIC_WCHAR_T_LOCK_FREE    __GCC_ATOMIC_WCHAR_T_LOCK_FREE
-#define ATOMIC_SHORT_T_LOCK_FREE    __GCC_ATOMIC_SHORT_T_LOCK_FREE
-#define ATOMIC_INT_T_LOCK_FREE      __GCC_ATOMIC_INT_T_LOCK_FREE
-#define ATOMIC_LONG_T_LOCK_FREE     __GCC_ATOMIC_LONG_T_LOCK_FREE
-#define ATOMIC_LLONG_T_LOCK_FREE    __GCC_ATOMIC_LLONG_T_LOCK_FREE
-#define ATOMIC_POINTER_T_LOCK_FREE  __GCC_ATOMIC_POINTER_T_LOCK_FREE
+/* We have a usable C++ <atomic>; use it instead.  */
 
-/* 7.17.2 Initialization */
+#include <atomic>
 
-#define ATOMIC_VAR_INIT(value) (value)
-#define atomic_init __c11_atomic_init
+#undef _Atomic
+        /* Also defined by <atomic> for gcc.  But not used in macros. */
+        /* Also a clang intrinsic.                                    */
+        /* Should not be used by client code before this file is      */
+        /* included.  The definitions in <atomic> themselves see      */
+        /* the old definition, as they should.                        */
+        /* Client code sees the following definition.                 */
 
-/* 7.17.3 Order and consistency */
+#define _Atomic(t) std::atomic<t>
 
-typedef enum memory_order {
-  memory_order_relaxed = __ATOMIC_RELAXED,
-  memory_order_consume = __ATOMIC_CONSUME,
-  memory_order_acquire = __ATOMIC_ACQUIRE,
-  memory_order_release = __ATOMIC_RELEASE,
-  memory_order_acq_rel = __ATOMIC_ACQ_REL,
-  memory_order_seq_cst = __ATOMIC_SEQ_CST
+using std::atomic_is_lock_free;
+using std::atomic_init;
+using std::atomic_store;
+using std::atomic_store_explicit;
+using std::atomic_load;
+using std::atomic_load_explicit;
+using std::atomic_exchange;
+using std::atomic_exchange_explicit;
+using std::atomic_compare_exchange_strong;
+using std::atomic_compare_exchange_strong_explicit;
+using std::atomic_compare_exchange_weak;
+using std::atomic_compare_exchange_weak_explicit;
+using std::atomic_fetch_add;
+using std::atomic_fetch_add_explicit;
+using std::atomic_fetch_sub;
+using std::atomic_fetch_sub_explicit;
+using std::atomic_fetch_or;
+using std::atomic_fetch_or_explicit;
+using std::atomic_fetch_xor;
+using std::atomic_fetch_xor_explicit;
+using std::atomic_fetch_and;
+using std::atomic_fetch_and_explicit;
+using std::atomic_thread_fence;
+using std::atomic_signal_fence;
+
+using std::memory_order;
+using std::memory_order_relaxed;
+using std::memory_order_consume;
+using std::memory_order_acquire;
+using std::memory_order_release;
+using std::memory_order_acq_rel;
+using std::memory_order_seq_cst;
+
+using std::atomic_bool;
+using std::atomic_char;
+using std::atomic_schar;
+using std::atomic_uchar;
+using std::atomic_short;
+using std::atomic_ushort;
+using std::atomic_int;
+using std::atomic_uint;
+using std::atomic_long;
+using std::atomic_ulong;
+using std::atomic_llong;
+using std::atomic_ullong;
+using std::atomic_char16_t;
+using std::atomic_char32_t;
+using std::atomic_wchar_t;
+using std::atomic_int_least8_t;
+using std::atomic_uint_least8_t;
+using std::atomic_int_least16_t;
+using std::atomic_uint_least16_t;
+using std::atomic_int_least32_t;
+using std::atomic_uint_least32_t;
+using std::atomic_int_least64_t;
+using std::atomic_uint_least64_t;
+using std::atomic_int_fast8_t;
+using std::atomic_uint_fast8_t;
+using std::atomic_int_fast16_t;
+using std::atomic_uint_fast16_t;
+using std::atomic_int_fast32_t;
+using std::atomic_uint_fast32_t;
+using std::atomic_int_fast64_t;
+using std::atomic_uint_fast64_t;
+using std::atomic_intptr_t;
+using std::atomic_uintptr_t;
+using std::atomic_size_t;
+using std::atomic_ptrdiff_t;
+using std::atomic_intmax_t;
+using std::atomic_uintmax_t;
+
+#else /* <atomic> unavailable, possibly because this is C, not C++ */
+
+#include <sys/types.h>
+#include <stdbool.h>
+
+/*
+ * C: Do it ourselves.
+ * Note that the runtime representation defined here should be compatible
+ * with the C++ one, i.e. an _Atomic(T) needs to contain the same
+ * bits as a T.
+ */
+
+#include <stddef.h>  /* For ptrdiff_t. */
+#include <stdint.h>  /* TODO: don't drag in all the macros, just the types. */
+// Include uchar.h only when available.  Bionic's stdatomic.h is also used for
+// the host (via a copy in prebuilts/clang) and uchar.h is not available in the
+// glibc used for the host.
+#if defined(__BIONIC__)
+# include <uchar.h>  /* For char16_t and char32_t.              */
+#endif
+
+/*
+ * 7.17.1 Atomic lock-free macros.
+ */
+
+#ifdef __GCC_ATOMIC_BOOL_LOCK_FREE
+#define	ATOMIC_BOOL_LOCK_FREE		__GCC_ATOMIC_BOOL_LOCK_FREE
+#endif
+#ifdef __GCC_ATOMIC_CHAR_LOCK_FREE
+#define	ATOMIC_CHAR_LOCK_FREE		__GCC_ATOMIC_CHAR_LOCK_FREE
+#endif
+#ifdef __GCC_ATOMIC_CHAR16_T_LOCK_FREE
+#define	ATOMIC_CHAR16_T_LOCK_FREE	__GCC_ATOMIC_CHAR16_T_LOCK_FREE
+#endif
+#ifdef __GCC_ATOMIC_CHAR32_T_LOCK_FREE
+#define	ATOMIC_CHAR32_T_LOCK_FREE	__GCC_ATOMIC_CHAR32_T_LOCK_FREE
+#endif
+#ifdef __GCC_ATOMIC_WCHAR_T_LOCK_FREE
+#define	ATOMIC_WCHAR_T_LOCK_FREE	__GCC_ATOMIC_WCHAR_T_LOCK_FREE
+#endif
+#ifdef __GCC_ATOMIC_SHORT_LOCK_FREE
+#define	ATOMIC_SHORT_LOCK_FREE		__GCC_ATOMIC_SHORT_LOCK_FREE
+#endif
+#ifdef __GCC_ATOMIC_INT_LOCK_FREE
+#define	ATOMIC_INT_LOCK_FREE		__GCC_ATOMIC_INT_LOCK_FREE
+#endif
+#ifdef __GCC_ATOMIC_LONG_LOCK_FREE
+#define	ATOMIC_LONG_LOCK_FREE		__GCC_ATOMIC_LONG_LOCK_FREE
+#endif
+#ifdef __GCC_ATOMIC_LLONG_LOCK_FREE
+#define	ATOMIC_LLONG_LOCK_FREE		__GCC_ATOMIC_LLONG_LOCK_FREE
+#endif
+#ifdef __GCC_ATOMIC_POINTER_LOCK_FREE
+#define	ATOMIC_POINTER_LOCK_FREE	__GCC_ATOMIC_POINTER_LOCK_FREE
+#endif
+
+/*
+ * 7.17.2 Initialization.
+ */
+
+#define	ATOMIC_VAR_INIT(value)		(value)
+#define	atomic_init(obj, value)		__c11_atomic_init(obj, value)
+
+/*
+ * Clang and recent GCC both provide predefined macros for the memory
+ * orderings.  If we are using a compiler that doesn't define them, use the
+ * clang values - these will be ignored in the fallback path.
+ */
+
+#ifndef __ATOMIC_RELAXED
+#define __ATOMIC_RELAXED		0
+#endif
+#ifndef __ATOMIC_CONSUME
+#define __ATOMIC_CONSUME		1
+#endif
+#ifndef __ATOMIC_ACQUIRE
+#define __ATOMIC_ACQUIRE		2
+#endif
+#ifndef __ATOMIC_RELEASE
+#define __ATOMIC_RELEASE		3
+#endif
+#ifndef __ATOMIC_ACQ_REL
+#define __ATOMIC_ACQ_REL		4
+#endif
+#ifndef __ATOMIC_SEQ_CST
+#define __ATOMIC_SEQ_CST		5
+#endif
+
+/*
+ * 7.17.3 Order and consistency.
+ *
+ * The memory_order_* constants that denote the barrier behaviour of the
+ * atomic operations.
+ * The enum values must be identical to those used by the
+ * C++ <atomic> header.
+ */
+
+typedef enum {
+	memory_order_relaxed = __ATOMIC_RELAXED,
+	memory_order_consume = __ATOMIC_CONSUME,
+	memory_order_acquire = __ATOMIC_ACQUIRE,
+	memory_order_release = __ATOMIC_RELEASE,
+	memory_order_acq_rel = __ATOMIC_ACQ_REL,
+	memory_order_seq_cst = __ATOMIC_SEQ_CST
 } memory_order;
 
-#define kill_dependency(y) (y)
+/*
+ * 7.17.4 Fences.
+ */
 
-/* 7.17.4 Fences */
-
-/* These should be provided by the libc implementation. */
-void atomic_thread_fence(memory_order);
-void atomic_signal_fence(memory_order);
-
-#define atomic_thread_fence(order) __c11_atomic_thread_fence(order)
-#define atomic_signal_fence(order) __c11_atomic_signal_fence(order)
-
-/* 7.17.5 Lock-free property */
-
-#define atomic_is_lock_free(obj) __c11_atomic_is_lock_free(sizeof(*(obj)))
-
-/* 7.17.6 Atomic integer types */
-
-#ifdef __cplusplus
-typedef _Atomic(bool)               atomic_bool;
-#else
-typedef _Atomic(_Bool)              atomic_bool;
-#endif
-typedef _Atomic(char)               atomic_char;
-typedef _Atomic(signed char)        atomic_schar;
-typedef _Atomic(unsigned char)      atomic_uchar;
-typedef _Atomic(short)              atomic_short;
-typedef _Atomic(unsigned short)     atomic_ushort;
-typedef _Atomic(int)                atomic_int;
-typedef _Atomic(unsigned int)       atomic_uint;
-typedef _Atomic(long)               atomic_long;
-typedef _Atomic(unsigned long)      atomic_ulong;
-typedef _Atomic(long long)          atomic_llong;
-typedef _Atomic(unsigned long long) atomic_ullong;
-typedef _Atomic(uint_least16_t)     atomic_char16_t;
-typedef _Atomic(uint_least32_t)     atomic_char32_t;
-typedef _Atomic(wchar_t)            atomic_wchar_t;
-typedef _Atomic(int_least8_t)       atomic_int_least8_t;
-typedef _Atomic(uint_least8_t)      atomic_uint_least8_t;
-typedef _Atomic(int_least16_t)      atomic_int_least16_t;
-typedef _Atomic(uint_least16_t)     atomic_uint_least16_t;
-typedef _Atomic(int_least32_t)      atomic_int_least32_t;
-typedef _Atomic(uint_least32_t)     atomic_uint_least32_t;
-typedef _Atomic(int_least64_t)      atomic_int_least64_t;
-typedef _Atomic(uint_least64_t)     atomic_uint_least64_t;
-typedef _Atomic(int_fast8_t)        atomic_int_fast8_t;
-typedef _Atomic(uint_fast8_t)       atomic_uint_fast8_t;
-typedef _Atomic(int_fast16_t)       atomic_int_fast16_t;
-typedef _Atomic(uint_fast16_t)      atomic_uint_fast16_t;
-typedef _Atomic(int_fast32_t)       atomic_int_fast32_t;
-typedef _Atomic(uint_fast32_t)      atomic_uint_fast32_t;
-typedef _Atomic(int_fast64_t)       atomic_int_fast64_t;
-typedef _Atomic(uint_fast64_t)      atomic_uint_fast64_t;
-typedef _Atomic(intptr_t)           atomic_intptr_t;
-typedef _Atomic(uintptr_t)          atomic_uintptr_t;
-typedef _Atomic(size_t)             atomic_size_t;
-typedef _Atomic(ptrdiff_t)          atomic_ptrdiff_t;
-typedef _Atomic(intmax_t)           atomic_intmax_t;
-typedef _Atomic(uintmax_t)          atomic_uintmax_t;
-
-/* 7.17.7 Operations on atomic types */
-
-#define atomic_store(object, desired) __c11_atomic_store(object, desired, __ATOMIC_SEQ_CST)
-#define atomic_store_explicit __c11_atomic_store
-
-#define atomic_load(object) __c11_atomic_load(object, __ATOMIC_SEQ_CST)
-#define atomic_load_explicit __c11_atomic_load
-
-#define atomic_exchange(object, desired) __c11_atomic_exchange(object, desired, __ATOMIC_SEQ_CST)
-#define atomic_exchange_explicit __c11_atomic_exchange
-
-#define atomic_compare_exchange_strong(object, expected, desired) __c11_atomic_compare_exchange_strong(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
-#define atomic_compare_exchange_strong_explicit __c11_atomic_compare_exchange_strong
-
-#define atomic_compare_exchange_weak(object, expected, desired) __c11_atomic_compare_exchange_weak(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
-#define atomic_compare_exchange_weak_explicit __c11_atomic_compare_exchange_weak
-
-#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
-#define atomic_fetch_add_explicit __c11_atomic_fetch_add
-
-#define atomic_fetch_sub(object, operand) __c11_atomic_fetch_sub(object, operand, __ATOMIC_SEQ_CST)
-#define atomic_fetch_sub_explicit __c11_atomic_fetch_sub
-
-#define atomic_fetch_or(object, operand) __c11_atomic_fetch_or(object, operand, __ATOMIC_SEQ_CST)
-#define atomic_fetch_or_explicit __c11_atomic_fetch_or
-
-#define atomic_fetch_xor(object, operand) __c11_atomic_fetch_xor(object, operand, __ATOMIC_SEQ_CST)
-#define atomic_fetch_xor_explicit __c11_atomic_fetch_xor
-
-#define atomic_fetch_and(object, operand) __c11_atomic_fetch_and(object, operand, __ATOMIC_SEQ_CST)
-#define atomic_fetch_and_explicit __c11_atomic_fetch_and
-
-/* 7.17.8 Atomic flag type and operations */
-
-typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
-
-#define ATOMIC_FLAG_INIT { 0 }
-
-/* These should be provided by the libc implementation. */
-#ifdef __cplusplus
-bool atomic_flag_test_and_set(volatile atomic_flag *);
-bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
-#else
-_Bool atomic_flag_test_and_set(volatile atomic_flag *);
-_Bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
-#endif
-void atomic_flag_clear(volatile atomic_flag *);
-void atomic_flag_clear_explicit(volatile atomic_flag *, memory_order);
-
-#define atomic_flag_test_and_set(object) __c11_atomic_exchange(&(object)->_Value, 1, __ATOMIC_SEQ_CST)
-#define atomic_flag_test_and_set_explicit(object, order) __c11_atomic_exchange(&(object)->_Value, 1, order)
-
-#define atomic_flag_clear(object) __c11_atomic_store(&(object)->_Value, 0, __ATOMIC_SEQ_CST)
-#define atomic_flag_clear_explicit(object, order) __c11_atomic_store(&(object)->_Value, 0, order)
-
-#ifdef __cplusplus
+static __inline void atomic_thread_fence(memory_order __order __attribute__((unused))) {
+	__c11_atomic_thread_fence(__order);
 }
+
+static __inline void atomic_signal_fence(memory_order __order __attribute__((unused))) {
+	__c11_atomic_signal_fence(__order);
+}
+
+/*
+ * 7.17.5 Lock-free property.
+ */
+
+#define	atomic_is_lock_free(obj) __c11_atomic_is_lock_free(sizeof(*(obj)))
+
+/*
+ * 7.17.6 Atomic integer types.
+ */
+
+typedef _Atomic(bool)			atomic_bool;
+typedef _Atomic(char)			atomic_char;
+typedef _Atomic(signed char)		atomic_schar;
+typedef _Atomic(unsigned char)		atomic_uchar;
+typedef _Atomic(short)			atomic_short;
+typedef _Atomic(unsigned short)		atomic_ushort;
+typedef _Atomic(int)			atomic_int;
+typedef _Atomic(unsigned int)		atomic_uint;
+typedef _Atomic(long)			atomic_long;
+typedef _Atomic(unsigned long)		atomic_ulong;
+typedef _Atomic(long long)		atomic_llong;
+typedef _Atomic(unsigned long long)	atomic_ullong;
+#if defined(__BIONIC__) || (defined(__cplusplus) && __cplusplus >= 201103L)
+  typedef _Atomic(char16_t)		atomic_char16_t;
+  typedef _Atomic(char32_t)		atomic_char32_t;
 #endif
+typedef _Atomic(wchar_t)		atomic_wchar_t;
+typedef _Atomic(int_least8_t)		atomic_int_least8_t;
+typedef _Atomic(uint_least8_t)		atomic_uint_least8_t;
+typedef _Atomic(int_least16_t)		atomic_int_least16_t;
+typedef _Atomic(uint_least16_t)		atomic_uint_least16_t;
+typedef _Atomic(int_least32_t)		atomic_int_least32_t;
+typedef _Atomic(uint_least32_t)		atomic_uint_least32_t;
+typedef _Atomic(int_least64_t)		atomic_int_least64_t;
+typedef _Atomic(uint_least64_t)		atomic_uint_least64_t;
+typedef _Atomic(int_fast8_t)		atomic_int_fast8_t;
+typedef _Atomic(uint_fast8_t)		atomic_uint_fast8_t;
+typedef _Atomic(int_fast16_t)		atomic_int_fast16_t;
+typedef _Atomic(uint_fast16_t)		atomic_uint_fast16_t;
+typedef _Atomic(int_fast32_t)		atomic_int_fast32_t;
+typedef _Atomic(uint_fast32_t)		atomic_uint_fast32_t;
+typedef _Atomic(int_fast64_t)		atomic_int_fast64_t;
+typedef _Atomic(uint_fast64_t)		atomic_uint_fast64_t;
+typedef _Atomic(intptr_t)		atomic_intptr_t;
+typedef _Atomic(uintptr_t)		atomic_uintptr_t;
+typedef _Atomic(size_t)			atomic_size_t;
+typedef _Atomic(ptrdiff_t)		atomic_ptrdiff_t;
+typedef _Atomic(intmax_t)		atomic_intmax_t;
+typedef _Atomic(uintmax_t)		atomic_uintmax_t;
 
-#endif /* __STDC_HOSTED__ */
-#endif /* __CLANG_STDATOMIC_H */
+/*
+ * 7.17.7 Operations on atomic types.
+ */
 
+/*
+ * Compiler-specific operations.
+ */
+
+#define	atomic_compare_exchange_strong_explicit(object, expected,	\
+    desired, success, failure)						\
+	__c11_atomic_compare_exchange_strong(object, expected, desired,	\
+	    success, failure)
+#define	atomic_compare_exchange_weak_explicit(object, expected,		\
+    desired, success, failure)						\
+	__c11_atomic_compare_exchange_weak(object, expected, desired,	\
+	    success, failure)
+#define	atomic_exchange_explicit(object, desired, order)		\
+	__c11_atomic_exchange(object, desired, order)
+#define	atomic_fetch_add_explicit(object, operand, order)		\
+	__c11_atomic_fetch_add(object, operand, order)
+#define	atomic_fetch_and_explicit(object, operand, order)		\
+	__c11_atomic_fetch_and(object, operand, order)
+#define	atomic_fetch_or_explicit(object, operand, order)		\
+	__c11_atomic_fetch_or(object, operand, order)
+#define	atomic_fetch_sub_explicit(object, operand, order)		\
+	__c11_atomic_fetch_sub(object, operand, order)
+#define	atomic_fetch_xor_explicit(object, operand, order)		\
+	__c11_atomic_fetch_xor(object, operand, order)
+#define	atomic_load_explicit(object, order)				\
+	__c11_atomic_load(object, order)
+#define	atomic_store_explicit(object, desired, order)			\
+	__c11_atomic_store(object, desired, order)
+
+/*
+ * Convenience functions.
+ */
+
+#define	atomic_compare_exchange_strong(object, expected, desired)	\
+	atomic_compare_exchange_strong_explicit(object, expected,	\
+	    desired, memory_order_seq_cst, memory_order_seq_cst)
+#define	atomic_compare_exchange_weak(object, expected, desired)		\
+	atomic_compare_exchange_weak_explicit(object, expected,		\
+	    desired, memory_order_seq_cst, memory_order_seq_cst)
+#define	atomic_exchange(object, desired)				\
+	atomic_exchange_explicit(object, desired, memory_order_seq_cst)
+#define	atomic_fetch_add(object, operand)				\
+	atomic_fetch_add_explicit(object, operand, memory_order_seq_cst)
+#define	atomic_fetch_and(object, operand)				\
+	atomic_fetch_and_explicit(object, operand, memory_order_seq_cst)
+#define	atomic_fetch_or(object, operand)				\
+	atomic_fetch_or_explicit(object, operand, memory_order_seq_cst)
+#define	atomic_fetch_sub(object, operand)				\
+	atomic_fetch_sub_explicit(object, operand, memory_order_seq_cst)
+#define	atomic_fetch_xor(object, operand)				\
+	atomic_fetch_xor_explicit(object, operand, memory_order_seq_cst)
+#define	atomic_load(object)						\
+	atomic_load_explicit(object, memory_order_seq_cst)
+#define	atomic_store(object, desired)					\
+	atomic_store_explicit(object, desired, memory_order_seq_cst)
+
+/*
+ * 7.17.8 Atomic flag type and operations.
+ *
+ * XXX: Assume atomic_bool can be used as an atomic_flag. Is there some
+ * kind of compiler built-in type we could use?
+ */
+
+typedef struct {
+	atomic_bool	__flag;
+} atomic_flag;
+
+#define	ATOMIC_FLAG_INIT		{ ATOMIC_VAR_INIT(false) }
+
+static __inline bool atomic_flag_test_and_set_explicit(volatile atomic_flag *__object, memory_order __order) {
+	return (atomic_exchange_explicit(&__object->__flag, 1, __order));
+}
+
+static __inline void atomic_flag_clear_explicit(volatile atomic_flag *__object, memory_order __order) {
+	atomic_store_explicit(&__object->__flag, 0, __order);
+}
+
+static __inline bool atomic_flag_test_and_set(volatile atomic_flag *__object) {
+	return (atomic_flag_test_and_set_explicit(__object, memory_order_seq_cst));
+}
+
+static __inline void atomic_flag_clear(volatile atomic_flag *__object) {
+	atomic_flag_clear_explicit(__object, memory_order_seq_cst);
+}
+
+#endif /* <atomic> unavailable */
+
+#endif /* !_STDATOMIC_H_ */
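
For context, a minimal sketch of the C path of the new stdatomic.h in use,
assuming a C11 compiler where the C++ <atomic> branch above is not taken;
the names counter and lock are illustrative only:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int counter = ATOMIC_VAR_INIT(0);
    static atomic_flag lock = ATOMIC_FLAG_INIT;

    int main(void) {
        atomic_fetch_add(&counter, 1);                 /* seq_cst RMW */
        atomic_fetch_add_explicit(&counter, 1,         /* relaxed RMW via the */
                                  memory_order_relaxed);  /* _explicit macro */

        while (atomic_flag_test_and_set(&lock))   /* toy spinlock built on */
            ;                                     /* the atomic_flag inlines */
        printf("counter = %d\n", atomic_load(&counter));
        atomic_flag_clear(&lock);
        return 0;
    }

When compiled as C++, the same source instead goes through the using
declarations above and resolves to the std::atomic machinery.
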
diff --git a/darwin-x86/clang-headers/stdbool.h b/darwin-x86/clang-headers/stdbool.h
index 0467893..5cb66b5 100644
--- a/darwin-x86/clang-headers/stdbool.h
+++ b/darwin-x86/clang-headers/stdbool.h
@@ -32,12 +32,15 @@
 #define true 1
 #define false 0
 #elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
-/* Define _Bool, bool, false, true as a GNU extension. */
+/* Define _Bool as a GNU extension. */
 #define _Bool bool
+#if __cplusplus < 201103L
+/* For C++98, define bool, false, true as a GNU extension. */
 #define bool  bool
 #define false false
 #define true  true
 #endif
+#endif
 
 #define __bool_true_false_are_defined 1
 
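The extra guard matters because C++11 and later forbid defining macro names
identical to keywords in translation units that use standard library headers,
so the GNU extension is now limited to C++98. The C side is unchanged; a small
sketch of what the header guarantees there:

    #include <stdbool.h>
    #include <stdio.h>

    int main(void) {
    #if __bool_true_false_are_defined
        bool ready = true;
        printf("%d\n", ready && !false);   /* prints 1 */
    #endif
        return 0;
    }
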
diff --git a/darwin-x86/clang-headers/stdint.h b/darwin-x86/clang-headers/stdint.h
index 3f2fcbc..0afcca3 100644
--- a/darwin-x86/clang-headers/stdint.h
+++ b/darwin-x86/clang-headers/stdint.h
@@ -88,7 +88,7 @@
  *
  * To accommodate targets that are missing types that are exactly 8, 16, 32, or
  * 64 bits wide, this implementation takes an approach of cascading
- * redefintions, redefining __int_leastN_t to successively smaller exact-width
+ * redefinitions, redefining __int_leastN_t to successively smaller exact-width
  * types. It is therefore important that the types are defined in order of
  * descending widths.
  *
@@ -255,19 +255,16 @@
  */
 #define __stdint_join3(a,b,c) a ## b ## c
 
-#define  __intn_t(n) __stdint_join3( int, n, _t)
-#define __uintn_t(n) __stdint_join3(uint, n, _t)
-
 #ifndef _INTPTR_T
 #ifndef __intptr_t_defined
-typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+typedef __INTPTR_TYPE__ intptr_t;
 #define __intptr_t_defined
 #define _INTPTR_T
 #endif
 #endif
 
 #ifndef _UINTPTR_T
-typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+typedef __UINTPTR_TYPE__ uintptr_t;
 #define _UINTPTR_T
 #endif
 
@@ -464,7 +461,7 @@
  * As in the type definitions, this section takes an approach of
  * successive-shrinking to determine which limits to use for the standard (8,
  * 16, 32, 64) bit widths when they don't have exact representations. It is
- * therefore important that the defintions be kept in order of decending
+ * therefore important that the definitions be kept in order of descending
  * widths.
  *
  * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
@@ -659,12 +656,12 @@
 /* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
 /* C99 7.18.3 Limits of other integer types. */
 
-#define  INTPTR_MIN  __INTN_MIN(__INTPTR_WIDTH__)
-#define  INTPTR_MAX  __INTN_MAX(__INTPTR_WIDTH__)
-#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
-#define PTRDIFF_MIN  __INTN_MIN(__PTRDIFF_WIDTH__)
-#define PTRDIFF_MAX  __INTN_MAX(__PTRDIFF_WIDTH__)
-#define    SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
+#define  INTPTR_MIN  (-__INTPTR_MAX__-1)
+#define  INTPTR_MAX    __INTPTR_MAX__
+#define UINTPTR_MAX   __UINTPTR_MAX__
+#define PTRDIFF_MIN (-__PTRDIFF_MAX__-1)
+#define PTRDIFF_MAX   __PTRDIFF_MAX__
+#define    SIZE_MAX      __SIZE_MAX__
 
 /* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
  * is enabled. */
@@ -673,9 +670,9 @@
 #endif
 
 /* C99 7.18.2.5 Limits of greatest-width integer types. */
-#define INTMAX_MIN   __INTN_MIN(__INTMAX_WIDTH__)
-#define INTMAX_MAX   __INTN_MAX(__INTMAX_WIDTH__)
-#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
+#define  INTMAX_MIN (-__INTMAX_MAX__-1)
+#define  INTMAX_MAX   __INTMAX_MAX__
+#define UINTMAX_MAX  __UINTMAX_MAX__
 
 /* C99 7.18.3 Limits of other integer types. */
 #define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
@@ -700,8 +697,8 @@
 #endif
 
 /* 7.18.4.2 Macros for greatest-width integer constants. */
-#define INTMAX_C(v)   __INTN_C(__INTMAX_WIDTH__, v)
-#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
+#define  INTMAX_C(v) __int_c(v,  __INTMAX_C_SUFFIX__)
+#define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)
 
 #endif /* __STDC_HOSTED__ */
 #endif /* __CLANG_STDINT_H */
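
A quick sketch exercising the rewritten pointer-width and greatest-width
limit macros (all standard C99 names; the printed values are target-dependent):

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        printf("INTPTR_MIN  = %" PRIdPTR "\n", (intptr_t)INTPTR_MIN);
        printf("INTPTR_MAX  = %" PRIdPTR "\n", (intptr_t)INTPTR_MAX);
        printf("SIZE_MAX    = %zu\n", (size_t)SIZE_MAX);
        printf("UINTMAX_MAX = %" PRIuMAX "\n", UINTMAX_MAX);
        printf("big         = %" PRIdMAX "\n", INTMAX_C(9000000000));
        return 0;
    }
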
diff --git a/darwin-x86/clang-headers/tgmath.h b/darwin-x86/clang-headers/tgmath.h
index 318e118..34e26dc 100644
--- a/darwin-x86/clang-headers/tgmath.h
+++ b/darwin-x86/clang-headers/tgmath.h
@@ -22,12 +22,21 @@
  *
 \*===----------------------------------------------------------------------===*/
 
-#ifndef __TGMATH_H
-#define __TGMATH_H
+#ifndef __CLANG_TGMATH_H
+#define __CLANG_TGMATH_H
 
 /* C99 7.22 Type-generic math <tgmath.h>. */
 #include <math.h>
 
+/*
+ * Allow additional definitions and implementation-defined values on Apple
+ * platforms. This is done after #include <math.h> to avoid dependency-cycle
+ * conflicts between libcxx and darwin in C++ modules builds.
+ */
+#if defined(__APPLE__) && __STDC_HOSTED__ && __has_include_next(<tgmath.h>)
+#  include_next <tgmath.h>
+#else
+
 /* C++ handles type genericity with overloading in math.h. */
 #ifndef __cplusplus
 #include <complex.h>
@@ -1371,4 +1380,5 @@
 #undef _TG_ATTRS
 
 #endif /* __cplusplus */
-#endif /* __TGMATH_H */
+#endif /* __has_include_next */
+#endif /* __CLANG_TGMATH_H */
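
For reference, the type-generic dispatch this header implements in C (on the
Apple include_next branch above, the platform header provides it instead):

    #include <tgmath.h>
    #include <stdio.h>

    int main(void) {
        float f = 2.0f;
        double d = 2.0;
        double complex z = 2.0 + 0.0 * I;  /* <complex.h> comes in via tgmath.h */
        printf("%f\n", sqrt(f));           /* dispatches to sqrtf */
        printf("%f\n", sqrt(d));           /* dispatches to sqrt  */
        printf("%f\n", creal(sqrt(z)));    /* dispatches to csqrt */
        return 0;
    }
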
diff --git a/darwin-x86/clang-headers/tmmintrin.h b/darwin-x86/clang-headers/tmmintrin.h
index a72796b..734cd39 100644
--- a/darwin-x86/clang-headers/tmmintrin.h
+++ b/darwin-x86/clang-headers/tmmintrin.h
@@ -27,9 +27,10 @@
 #include <pmmintrin.h>
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
 
-/// \brief Computes the absolute value of each of the packed 8-bit signed
+/// Computes the absolute value of each of the packed 8-bit signed
 ///    integers in the source operand and stores the 8-bit unsigned integer
 ///    results in the destination.
 ///
@@ -41,13 +42,13 @@
 ///    A 64-bit vector of [8 x i8].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_abs_pi8(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
 }
 
-/// \brief Computes the absolute value of each of the packed 8-bit signed
+/// Computes the absolute value of each of the packed 8-bit signed
 ///    integers in the source operand and stores the 8-bit unsigned integer
 ///    results in the destination.
 ///
@@ -65,7 +66,7 @@
     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
 }
 
-/// \brief Computes the absolute value of each of the packed 16-bit signed
+/// Computes the absolute value of each of the packed 16-bit signed
 ///    integers in the source operand and stores the 16-bit unsigned integer
 ///    results in the destination.
 ///
@@ -77,13 +78,13 @@
 ///    A 64-bit vector of [4 x i16].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_abs_pi16(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
 }
 
-/// \brief Computes the absolute value of each of the packed 16-bit signed
+/// Computes the absolute value of each of the packed 16-bit signed
 ///    integers in the source operand and stores the 16-bit unsigned integer
 ///    results in the destination.
 ///
@@ -101,7 +102,7 @@
     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
 }
 
-/// \brief Computes the absolute value of each of the packed 32-bit signed
+/// Computes the absolute value of each of the packed 32-bit signed
 ///    integers in the source operand and stores the 32-bit unsigned integer
 ///    results in the destination.
 ///
@@ -113,13 +114,13 @@
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_abs_pi32(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 }
 
-/// \brief Computes the absolute value of each of the packed 32-bit signed
+/// Computes the absolute value of each of the packed 32-bit signed
 ///    integers in the source operand and stores the 32-bit unsigned integer
 ///    results in the destination.
 ///
@@ -137,7 +138,7 @@
     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 }
 
-/// \brief Concatenates the two 128-bit integer vector operands, and
+/// Concatenates the two 128-bit integer vector operands, and
 ///    right-shifts the result by the number of bytes specified in the immediate
 ///    operand.
 ///
@@ -157,11 +158,11 @@
 ///    An immediate operand specifying how many bytes to right-shift the result.
 /// \returns A 128-bit integer vector containing the concatenated right-shifted
 ///    value.
-#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
+#define _mm_alignr_epi8(a, b, n) \
   (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
-                                     (__v16qi)(__m128i)(b), (n)); })
+                                     (__v16qi)(__m128i)(b), (n))
 
-/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
+/// Concatenates the two 64-bit integer vector operands, and right-shifts
 ///    the result by the number of bytes specified in the immediate operand.
 ///
 /// \headerfile <x86intrin.h>
@@ -180,10 +181,10 @@
 ///    An immediate operand specifying how many bytes to right-shift the result.
 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 ///    value.
-#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
-  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
+#define _mm_alignr_pi8(a, b, n) \
+  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
 
-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [8 x i16].
 ///
 /// \headerfile <x86intrin.h>
@@ -206,7 +207,7 @@
     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [4 x i32].
 ///
 /// \headerfile <x86intrin.h>
@@ -229,7 +230,7 @@
     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 }
 
-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    64-bit vectors of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
@@ -246,13 +247,13 @@
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hadd_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 }
 
-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    64-bit vectors of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
@@ -269,15 +270,16 @@
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 }
 
-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
-///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
-///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
+///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
+///    0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -299,9 +301,10 @@
     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
-///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
-///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
+///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
+///    0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -317,13 +320,13 @@
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 }
 
-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 128-bit vectors of [8 x i16].
 ///
 /// \headerfile <x86intrin.h>
@@ -346,7 +349,7 @@
     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 128-bit vectors of [4 x i32].
 ///
 /// \headerfile <x86intrin.h>
@@ -369,7 +372,7 @@
     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 }
 
-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 64-bit vectors of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
@@ -386,13 +389,13 @@
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 }
 
-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 64-bit vectors of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
@@ -409,16 +412,16 @@
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 }
 
-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
-///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
-///    saturated to 8000h.
+///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
+///    saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -440,10 +443,10 @@
     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
-///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
-///    saturated to 8000h.
+///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
+///    saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -459,20 +462,21 @@
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 }
 
-/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
 ///    values contained in the first source operand and packed 8-bit signed
 ///    integer values contained in the second source operand, adds pairs of
 ///    contiguous products with signed saturation, and writes the 16-bit sums to
-///    the corresponding bits in the destination. For example, bits [7:0] of
-///    both operands are multiplied, bits [15:8] of both operands are
-///    multiplied, and the sum of both results is written to bits [15:0] of the
-///    destination.
+///    the corresponding bits in the destination.
+///
+///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
+///    both operands are multiplied, and the sum of both results is written to
+///    bits [15:0] of the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -483,29 +487,30 @@
 /// \param __b
 ///    A 128-bit integer vector containing the second source operand.
 /// \returns A 128-bit integer vector containing the sums of products of both
-///    operands:
-///    R0 := (__a0 * __b0) + (__a1 * __b1)
-///    R1 := (__a2 * __b2) + (__a3 * __b3)
-///    R2 := (__a4 * __b4) + (__a5 * __b5)
-///    R3 := (__a6 * __b6) + (__a7 * __b7)
-///    R4 := (__a8 * __b8) + (__a9 * __b9)
-///    R5 := (__a10 * __b10) + (__a11 * __b11)
-///    R6 := (__a12 * __b12) + (__a13 * __b13)
-///    R7 := (__a14 * __b14) + (__a15 * __b15)
+///    operands: \n
+///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
+///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
+///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
+///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
+///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maddubs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 }
 
-/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
 ///    values contained in the first source operand and packed 8-bit signed
 ///    integer values contained in the second source operand, adds pairs of
 ///    contiguous products with signed saturation, and writes the 16-bit sums to
-///    the corresponding bits in the destination. For example, bits [7:0] of
-///    both operands are multiplied, bits [15:8] of both operands are
-///    multiplied, and the sum of both results is written to bits [15:0] of the
-///    destination.
+///    the corresponding bits in the destination.
+///
+///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
+///    both operands are multiplied, and the sum of both results is written to
+///    bits [15:0] of the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -516,18 +521,18 @@
 /// \param __b
 ///    A 64-bit integer vector containing the second source operand.
 /// \returns A 64-bit integer vector containing the sums of products of both
-///    operands:
-///    R0 := (__a0 * __b0) + (__a1 * __b1)
-///    R1 := (__a2 * __b2) + (__a3 * __b3)
-///    R2 := (__a4 * __b4) + (__a5 * __b5)
-///    R3 := (__a6 * __b6) + (__a7 * __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+///    operands: \n
+///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 }
 
-/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
 ///    products to the 18 most significant bits by right-shifting, rounds the
 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 ///
@@ -547,7 +552,7 @@
     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
 ///    products to the 18 most significant bits by right-shifting, rounds the
 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 ///
@@ -561,13 +566,13 @@
 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 }
 
-/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
+/// Copies the 8-bit integers from a 128-bit integer vector to the
 ///    destination or clears 8-bit values in the destination, as specified by
 ///    the second source operand.
 ///
@@ -580,11 +585,11 @@
 /// \param __b
 ///    A 128-bit integer vector containing control bytes corresponding to
 ///    positions in the destination:
-///    Bit 7:
-///    1: Clear the corresponding byte in the destination.
+///    Bit 7: \n
+///    1: Clear the corresponding byte in the destination. \n
 ///    0: Copy the selected source byte to the corresponding byte in the
-///    destination.
-///    Bits [6:4] Reserved.
+///    destination. \n
+///    Bits [6:4] Reserved.  \n
 ///    Bits [3:0] select the source byte to be copied.
 /// \returns A 128-bit integer vector containing the copied or cleared values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -593,7 +598,7 @@
     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 }
 
-/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
+/// Copies the 8-bit integers from a 64-bit integer vector to the
 ///    destination or clears 8-bit values in the destination, as specified by
 ///    the second source operand.
 ///
@@ -606,26 +611,27 @@
 /// \param __b
 ///    A 64-bit integer vector containing control bytes corresponding to
 ///    positions in the destination:
-///    Bit 7:
-///    1: Clear the corresponding byte in the destination.
+///    Bit 7: \n
+///    1: Clear the corresponding byte in the destination. \n
 ///    0: Copy the selected source byte to the corresponding byte in the
-///    destination.
+///    destination. \n
 ///    Bits [3:0] select the source byte to be copied.
 /// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 }
 
-/// \brief For each 8-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    byte in the second source is negative, calculate the two's complement of
-///    the corresponding byte in the first source, and write that value to the
-///    destination. If the byte in the second source is positive, copy the
-///    corresponding byte from the first source to the destination. If the byte
-///    in the second source is zero, clear the corresponding byte in the
-///    destination.
+/// For each 8-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand.
+///
+///    If the byte in the second source is negative, calculate the two's
+///    complement of the corresponding byte in the first source, and write that
+///    value to the destination. If the byte in the second source is positive,
+///    copy the corresponding byte from the first source to the destination. If
+///    the byte in the second source is zero, clear the corresponding byte in
+///    the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -643,14 +649,15 @@
     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 }
 
-/// \brief For each 16-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    word in the second source is negative, calculate the two's complement of
-///    the corresponding word in the first source, and write that value to the
-///    destination. If the word in the second source is positive, copy the
-///    corresponding word from the first source to the destination. If the word
-///    in the second source is zero, clear the corresponding word in the
-///    destination.
+/// For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand.
+///
+///    If the word in the second source is negative, calculate the two's
+///    complement of the corresponding word in the first source, and write that
+///    value to the destination. If the word in the second source is positive,
+///    copy the corresponding word from the first source to the destination. If
+///    the word in the second source is zero, clear the corresponding word in
+///    the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -668,9 +675,10 @@
     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// \brief For each 32-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    doubleword in the second source is negative, calculate the two's
+/// For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand.
+///
+///    If the doubleword in the second source is negative, calculate the two's
 ///    complement of the corresponding word in the first source, and write that
 ///    value to the destination. If the doubleword in the second source is
 ///    positive, copy the corresponding word from the first source to the
@@ -693,14 +701,15 @@
     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 }
 
-/// \brief For each 8-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    byte in the second source is negative, calculate the two's complement of
-///    the corresponding byte in the first source, and write that value to the
-///    destination. If the byte in the second source is positive, copy the
-///    corresponding byte from the first source to the destination. If the byte
-///    in the second source is zero, clear the corresponding byte in the
-///    destination.
+/// For each 8-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand.
+///
+///    If the byte in the second source is negative, calculate the two's
+///    complement of the corresponding byte in the first source, and write that
+///    value to the destination. If the byte in the second source is positive,
+///    copy the corresponding byte from the first source to the destination. If
+///    the byte in the second source is zero, clear the corresponding byte in
+///    the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -712,20 +721,21 @@
 ///    A 64-bit integer vector containing control bytes corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_sign_pi8(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 }
 
-/// \brief For each 16-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    word in the second source is negative, calculate the two's complement of
-///    the corresponding word in the first source, and write that value to the
-///    destination. If the word in the second source is positive, copy the
-///    corresponding word from the first source to the destination. If the word
-///    in the second source is zero, clear the corresponding word in the
-///    destination.
+/// For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand.
+///
+///    If the word in the second source is negative, calculate the two's
+///    complement of the corresponding word in the first source, and write that
+///    value to the destination. If the word in the second source is positive,
+///    copy the corresponding word from the first source to the destination. If
+///    the word in the second source is zero, clear the corresponding word in
+///    the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -737,15 +747,16 @@
 ///    A 64-bit integer vector containing control words corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_sign_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 }
 
-/// \brief For each 32-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    doubleword in the second source is negative, calculate the two's
+/// For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand.
+///
+///    If the doubleword in the second source is negative, calculate the two's
 ///    complement of the corresponding doubleword in the first source, and
 ///    write that value to the destination. If the doubleword in the second
 ///    source is positive, copy the corresponding doubleword from the first
@@ -762,12 +773,13 @@
 ///    A 64-bit integer vector containing two control doublewords corresponding
 ///    to positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_sign_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
 }
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_MMX
 
 #endif /* __TMMINTRIN_H */
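
A short SSSE3 sketch (compile with -mssse3) exercising two of the intrinsics
documented above; the input values are illustrative:

    #include <tmmintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i v = _mm_setr_epi16(-1, -2, 3, -4, 5, -6, 7, -8);
        __m128i a = _mm_abs_epi16(v);      /* per-element absolute value */

        __m128i lo = _mm_setr_epi32(1, 2, 3, 4);
        __m128i hi = _mm_setr_epi32(5, 6, 7, 8);
        /* Concatenate hi:lo, shift right 4 bytes: yields {2, 3, 4, 5}. */
        __m128i r = _mm_alignr_epi8(hi, lo, 4);

        short s[8];
        int w[4];
        _mm_storeu_si128((__m128i *)s, a);
        _mm_storeu_si128((__m128i *)w, r);
        printf("%d %d\n", s[0], s[7]);                    /* 1 8 */
        printf("%d %d %d %d\n", w[0], w[1], w[2], w[3]);  /* 2 3 4 5 */
        return 0;
    }
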
diff --git a/darwin-x86/clang-headers/unwind.h b/darwin-x86/clang-headers/unwind.h
index 4f74a34..0e8317e 100644
--- a/darwin-x86/clang-headers/unwind.h
+++ b/darwin-x86/clang-headers/unwind.h
@@ -76,7 +76,13 @@
 typedef uintptr_t _uleb128_t;
 
 struct _Unwind_Context;
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+struct _Unwind_Control_Block;
+typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
+#else
 struct _Unwind_Exception;
+typedef struct _Unwind_Exception _Unwind_Exception;
+#endif
 typedef enum {
   _URC_NO_REASON = 0,
 #if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
@@ -109,34 +115,73 @@
 } _Unwind_Action;
 
 typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
-                                             struct _Unwind_Exception *);
+                                             _Unwind_Exception *);
 
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+typedef struct _Unwind_Control_Block _Unwind_Control_Block;
+typedef uint32_t _Unwind_EHT_Header;
+
+struct _Unwind_Control_Block {
+  uint64_t exception_class;
+  void (*exception_cleanup)(_Unwind_Reason_Code, _Unwind_Control_Block *);
+  /* unwinder cache (private fields for the unwinder's use) */
+  struct {
+    uint32_t reserved1; /* forced unwind stop function, 0 if not forced */
+    uint32_t reserved2; /* personality routine */
+    uint32_t reserved3; /* callsite */
+    uint32_t reserved4; /* forced unwind stop argument */
+    uint32_t reserved5;
+  } unwinder_cache;
+  /* propagation barrier cache (valid after phase 1) */
+  struct {
+    uint32_t sp;
+    uint32_t bitpattern[5];
+  } barrier_cache;
+  /* cleanup cache (preserved over cleanup) */
+  struct {
+    uint32_t bitpattern[4];
+  } cleanup_cache;
+  /* personality cache (for personality's benefit) */
+  struct {
+    uint32_t fnstart;         /* function start address */
+    _Unwind_EHT_Header *ehtp; /* pointer to EHT entry header word */
+    uint32_t additional;      /* additional data */
+    uint32_t reserved1;
+  } pr_cache;
+  long long int : 0; /* force alignment of next item to 8-byte boundary */
+} __attribute__((__aligned__(8)));
+#else
 struct _Unwind_Exception {
   _Unwind_Exception_Class exception_class;
   _Unwind_Exception_Cleanup_Fn exception_cleanup;
+#if !defined (__USING_SJLJ_EXCEPTIONS__) && defined (__SEH__)
+  _Unwind_Word private_[6];
+#else
   _Unwind_Word private_1;
   _Unwind_Word private_2;
+#endif
   /* The Itanium ABI requires that _Unwind_Exception objects are "double-word
    * aligned".  GCC has interpreted this to mean "use the maximum useful
    * alignment for the target"; so do we. */
 } __attribute__((__aligned__));
+#endif
 
 typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
                                                _Unwind_Exception_Class,
-                                               struct _Unwind_Exception *,
+                                               _Unwind_Exception *,
                                                struct _Unwind_Context *,
                                                void *);
 
-typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
-    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
-    struct _Unwind_Context *);
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(int, _Unwind_Action,
+                                                      _Unwind_Exception_Class,
+                                                      _Unwind_Exception *,
+                                                      struct _Unwind_Context *);
 typedef _Unwind_Personality_Fn __personality_routine;
 
 typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
                                                 void *);
 
-#if defined(__arm__) && !defined(__APPLE__)
-
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
 typedef enum {
   _UVRSC_CORE = 0,        /* integer register */
   _UVRSC_VFP = 1,         /* vfp */
@@ -158,14 +203,12 @@
   _UVRSR_FAILED = 2
 } _Unwind_VRS_Result;
 
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__)
 typedef uint32_t _Unwind_State;
 #define _US_VIRTUAL_UNWIND_FRAME  ((_Unwind_State)0)
 #define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1)
 #define _US_UNWIND_FRAME_RESUME   ((_Unwind_State)2)
 #define _US_ACTION_MASK           ((_Unwind_State)3)
 #define _US_FORCE_UNWIND          ((_Unwind_State)8)
-#endif
 
 _Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
   _Unwind_VRS_RegClass __regclass,
@@ -224,13 +267,12 @@
 
 /* DWARF EH functions; currently not available on Darwin/ARM */
 #if !defined(__APPLE__) || !defined(__arm__)
-
-_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
-                                         _Unwind_Stop_Fn, void *);
-void _Unwind_DeleteException(struct _Unwind_Exception *);
-void _Unwind_Resume(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(_Unwind_Exception *, _Unwind_Stop_Fn,
+                                         void *);
+void _Unwind_DeleteException(_Unwind_Exception *);
+void _Unwind_Resume(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(_Unwind_Exception *);
 
 #endif
 
@@ -241,11 +283,11 @@
 
 void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
 void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
-_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(_Unwind_Exception *,
                                               _Unwind_Stop_Fn, void *);
-void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+void _Unwind_SjLj_Resume(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(_Unwind_Exception *);
 
 void *_Unwind_FindEnclosingFunction(void *);
 
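A sketch of the portable unwind API in use: walking the current stack with
_Unwind_Backtrace and _Unwind_GetIP, both declared in this header. This
assumes the common non-SjLj configuration and an unwinder present at runtime:

    #include <unwind.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Unwind_Reason_Code trace_cb(struct _Unwind_Context *ctx, void *arg) {
        int *depth = (int *)arg;
        printf("#%d pc=%p\n", (*depth)++, (void *)(uintptr_t)_Unwind_GetIP(ctx));
        return _URC_NO_REASON;  /* continue to the next frame */
    }

    int main(void) {
        int depth = 0;
        _Unwind_Backtrace(trace_cb, &depth);
        return 0;
    }
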
diff --git a/darwin-x86/clang-headers/vaesintrin.h b/darwin-x86/clang-headers/vaesintrin.h
new file mode 100644
index 0000000..e4174bb
--- /dev/null
+++ b/darwin-x86/clang-headers/vaesintrin.h
@@ -0,0 +1,98 @@
+/*===------------------ vaesintrin.h - VAES intrinsics ---------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <vaesintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VAESINTRIN_H
+#define __VAESINTRIN_H
+
+/* Default attributes for YMM forms. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256)))
+
+/* Default attributes for ZMM forms. */
+#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"), __min_vector_width__(512)))
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+ _mm256_aesenc_epi128(__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_aesenc256((__v4di) __A,
+              (__v4di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesenc_epi128(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
+              (__v8di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+ _mm256_aesdec_epi128(__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_aesdec256((__v4di) __A,
+              (__v4di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesdec_epi128(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
+              (__v8di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+ _mm256_aesenclast_epi128(__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A,
+              (__v4di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesenclast_epi128(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
+              (__v8di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+ _mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A,
+              (__v4di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
+              (__v8di) __B);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_F
+
+#endif
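
The new header's intrinsics apply one AES round per 128-bit lane of a wide
register. A minimal sketch (compile with -mvaes; requires VAES hardware), with
the wrapper names being illustrative only:

    #include <immintrin.h>  /* vaesintrin.h must not be included directly */

    /* One middle round on two independent 128-bit AES states at once. */
    __m256i aes_round_x2(__m256i states, __m256i round_keys) {
        return _mm256_aesenc_epi128(states, round_keys);
    }

    /* The final round (no MixColumns), same two-lane layout. */
    __m256i aes_last_round_x2(__m256i states, __m256i round_keys) {
        return _mm256_aesenclast_epi128(states, round_keys);
    }
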
diff --git a/darwin-x86/clang-headers/vecintrin.h b/darwin-x86/clang-headers/vecintrin.h
index ca7acb4..f7061e8 100644
--- a/darwin-x86/clang-headers/vecintrin.h
+++ b/darwin-x86/clang-headers/vecintrin.h
@@ -116,6 +116,13 @@
   return __vec[__index & 1];
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai float
+vec_extract(vector float __vec, int __index) {
+  return __vec[__index & 3];
+}
+#endif
+
 static inline __ATTRS_o_ai double
 vec_extract(vector double __vec, int __index) {
   return __vec[__index & 1];
@@ -129,6 +136,7 @@
   return __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_insert(unsigned char __scalar, vector bool char __vec, int __index) {
   vector unsigned char __newvec = (vector unsigned char)__vec;
@@ -148,6 +156,7 @@
   return __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_insert(unsigned short __scalar, vector bool short __vec, int __index) {
   vector unsigned short __newvec = (vector unsigned short)__vec;
@@ -167,6 +176,7 @@
   return __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_insert(unsigned int __scalar, vector bool int __vec, int __index) {
   vector unsigned int __newvec = (vector unsigned int)__vec;
@@ -187,6 +197,7 @@
   return __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_insert(unsigned long long __scalar, vector bool long long __vec,
            int __index) {
@@ -202,6 +213,14 @@
   return __vec;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_insert(float __scalar, vector float __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+#endif
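+
+/* Illustrative sketch, not part of the upstream header: vec_insert
+ * returns an updated copy and leaves its vector argument unchanged:
+ *
+ *   vector float set_lane2(vector float v, float x) {
+ *     return vec_insert(x, v, 2);  // lanes 0, 1 and 3 keep their values
+ *   }
+ */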
+
 static inline __ATTRS_o_ai vector double
 vec_insert(double __scalar, vector double __vec, int __index) {
   __vec[__index & 1] = __scalar;
@@ -282,6 +301,16 @@
   return __vec;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_promote(float __scalar, int __index) {
+  const vector float __zero = (vector float)0;
+  vector float __vec = __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+#endif
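+
+/* Illustrative sketch, not part of the upstream header: vec_promote
+ * defines only the requested lane; the -1 shuffle indices above leave
+ * every other lane undefined:
+ *
+ *   vector float v = vec_promote(1.0f, 3);  // only lane 3 is defined
+ */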
+
 static inline __ATTRS_o_ai vector double
 vec_promote(double __scalar, int __index) {
   const vector double __zero = (vector double)0;
@@ -348,6 +377,15 @@
   return __vec;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_insert_and_zero(const float *__ptr) {
+  vector float __vec = (vector float)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+#endif
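+
+/* Illustrative sketch, not part of the upstream header: unlike
+ * vec_promote, vec_insert_and_zero clears the remaining lanes; the
+ * 32-bit forms place the value in lane 1, matching VLLEZ semantics:
+ *
+ *   float x = 2.0f;
+ *   vector float v = vec_insert_and_zero(&x);  // {0, x, 0, 0}
+ */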
+
 static inline __ATTRS_o_ai vector double
 vec_insert_and_zero(const double *__ptr) {
   vector double __vec = (vector double)0;
@@ -441,6 +479,15 @@
            (vector unsigned char)__a, (vector unsigned char)__b, __c);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_perm(vector float __a, vector float __b,
+         vector unsigned char __c) {
+  return (vector float)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_perm(vector double __a, vector double __b,
          vector unsigned char __c) {
@@ -450,18 +497,22 @@
 
 /*-- vec_permi --------------------------------------------------------------*/
 
+// This prototype is deprecated.
 extern __ATTRS_o vector signed long long
 vec_permi(vector signed long long __a, vector signed long long __b, int __c)
   __constant_range(__c, 0, 3);
 
+// This prototype is deprecated.
 extern __ATTRS_o vector unsigned long long
 vec_permi(vector unsigned long long __a, vector unsigned long long __b, int __c)
   __constant_range(__c, 0, 3);
 
+// This prototype is deprecated.
 extern __ATTRS_o vector bool long long
 vec_permi(vector bool long long __a, vector bool long long __b, int __c)
   __constant_range(__c, 0, 3);
 
+// This prototype is deprecated.
 extern __ATTRS_o vector double
 vec_permi(vector double __a, vector double __b, int __c)
   __constant_range(__c, 0, 3);
@@ -471,6 +522,15 @@
                       (vector unsigned long long)(Y), \
                       (((Z) & 2) << 1) | ((Z) & 1)))
 
+/*-- vec_bperm_u128 ---------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_ai vector unsigned long long
+vec_bperm_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vbperm(__a, __b);
+}
+#endif
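+
+/* Illustrative sketch, not part of the upstream header: vec_bperm_u128
+ * collects 16 bits of __a, one per bit index held in the bytes of __b,
+ * into a single halfword of the result:
+ *
+ *   vector unsigned long long bits = vec_bperm_u128(data, bit_indices);
+ */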
+
 /*-- vec_sel ----------------------------------------------------------------*/
 
 static inline __ATTRS_o_ai vector signed char
@@ -614,6 +674,22 @@
           (~(vector unsigned long long)__c & __a));
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_sel(vector float __a, vector float __b, vector unsigned int __c) {
+  return (vector float)((__c & (vector unsigned int)__b) |
+                        (~__c & (vector unsigned int)__a));
+}
+
+static inline __ATTRS_o_ai vector float
+vec_sel(vector float __a, vector float __b, vector bool int __c) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  vector unsigned int __bc = (vector unsigned int)__b;
+  vector unsigned int __cc = (vector unsigned int)__c;
+  return (vector float)((__cc & __bc) | (~__cc & __ac));
+}
+#endif
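+
+/* Illustrative sketch, not part of the upstream header: vec_sel picks
+ * __b where the mask bit is 1 and __a where it is 0, so it combines
+ * naturally with the comparison intrinsics (NaNs aside):
+ *
+ *   vector float vmax(vector float a, vector float b) {
+ *     return vec_sel(a, b, vec_cmpgt(b, a));
+ *   }
+ */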
+
 static inline __ATTRS_o_ai vector double
 vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
   return (vector double)((__c & (vector unsigned long long)__b) |
@@ -687,6 +763,17 @@
   return __vec;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_gather_element(vector float __vec, vector unsigned int __offset,
+                   const float *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const float *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+#endif
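+
+/* Illustrative sketch, not part of the upstream header: the offsets
+ * are per-lane byte offsets added to __ptr, and only the lane named by
+ * the constant __index is updated:
+ *
+ *   v = vec_gather_element(v, offsets, base, 0);  // reloads lane 0
+ */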
+
 static inline __ATTRS_o_ai vector double
 vec_gather_element(vector double __vec, vector unsigned long long __offset,
                    const double *__ptr, int __index)
@@ -749,6 +836,16 @@
     __vec[__index];
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector float __vec, vector unsigned int __offset,
+                    float *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(float *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+#endif
+
 static inline __ATTRS_o_ai void
 vec_scatter_element(vector double __vec, vector unsigned long long __offset,
                     double *__ptr, int __index)
@@ -757,48 +854,111 @@
     __vec[__index];
 }
 
+/*-- vec_xl -----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xl(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xl(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xl(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xl(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xl(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xl(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_xl(long __offset, const signed long long *__ptr) {
+  return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xl(long __offset, const unsigned long long *__ptr) {
+  return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_xl(long __offset, const float *__ptr) {
+  return *(const vector float *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
+vec_xl(long __offset, const double *__ptr) {
+  return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset);
+}
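+
+/* Illustrative sketch, not part of the upstream header: vec_xl is the
+ * non-deprecated replacement for the vec_xld2/vec_xlw4 forms below; it
+ * loads a whole vector from the byte offset __offset past __ptr:
+ *
+ *   vector float load4(const float *p) { return vec_xl(0, p); }
+ */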
+
 /*-- vec_xld2 ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_xld2(long __offset, const signed char *__ptr) {
   return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_xld2(long __offset, const unsigned char *__ptr) {
   return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_xld2(long __offset, const signed short *__ptr) {
   return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_xld2(long __offset, const unsigned short *__ptr) {
   return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_xld2(long __offset, const signed int *__ptr) {
   return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_xld2(long __offset, const unsigned int *__ptr) {
   return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_xld2(long __offset, const signed long long *__ptr) {
   return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_xld2(long __offset, const unsigned long long *__ptr) {
   return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_xld2(long __offset, const double *__ptr) {
   return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset);
@@ -806,74 +966,145 @@
 
 /*-- vec_xlw4 ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_xlw4(long __offset, const signed char *__ptr) {
   return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_xlw4(long __offset, const unsigned char *__ptr) {
   return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_xlw4(long __offset, const signed short *__ptr) {
   return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_xlw4(long __offset, const unsigned short *__ptr) {
   return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_xlw4(long __offset, const signed int *__ptr) {
   return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_xlw4(long __offset, const unsigned int *__ptr) {
   return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+/*-- vec_xst ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xst(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector signed long long __vec, long __offset,
+          signed long long *__ptr) {
+  *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector unsigned long long __vec, long __offset,
+          unsigned long long *__ptr) {
+  *(vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset) =
+    __vec;
+}
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai void
+vec_xst(vector float __vec, long __offset, float *__ptr) {
+  *(vector float *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+#endif
+
+static inline __ATTRS_o_ai void
+vec_xst(vector double __vec, long __offset, double *__ptr) {
+  *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
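+
+/* Illustrative sketch, not part of the upstream header: vec_xst is the
+ * store counterpart of vec_xl and replaces the deprecated
+ * vec_xstd2/vec_xstw4 forms below:
+ *
+ *   void store4(vector float v, float *p) { vec_xst(v, 0, p); }
+ */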
+
 /*-- vec_xstd2 --------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector signed char __vec, long __offset, signed char *__ptr) {
   *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
   *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector signed short __vec, long __offset, signed short *__ptr) {
   *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
   *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector signed int __vec, long __offset, signed int *__ptr) {
   *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
   *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector signed long long __vec, long __offset,
           signed long long *__ptr) {
   *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector unsigned long long __vec, long __offset,
           unsigned long long *__ptr) {
@@ -881,6 +1112,7 @@
     __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector double __vec, long __offset, double *__ptr) {
   *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
@@ -888,31 +1120,37 @@
 
 /*-- vec_xstw4 --------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector signed char __vec, long __offset, signed char *__ptr) {
   *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
   *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector signed short __vec, long __offset, signed short *__ptr) {
   *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
   *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector signed int __vec, long __offset, signed int *__ptr) {
   *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
   *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
@@ -952,6 +1190,12 @@
 vec_load_bndry(const unsigned long long *__ptr, unsigned short __len)
   __constant_pow2_range(__len, 64, 4096);
 
+#if __ARCH__ >= 12
+extern __ATTRS_o vector float
+vec_load_bndry(const float *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+#endif
+
 extern __ATTRS_o vector double
 vec_load_bndry(const double *__ptr, unsigned short __len)
   __constant_pow2_range(__len, 64, 4096);
@@ -1007,11 +1251,27 @@
   return (vector unsigned long long)__builtin_s390_vll(__len, __ptr);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_load_len(const float *__ptr, unsigned int __len) {
+  return (vector float)__builtin_s390_vll(__len, __ptr);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_load_len(const double *__ptr, unsigned int __len) {
   return (vector double)__builtin_s390_vll(__len, __ptr);
 }
 
+/*-- vec_load_len_r ---------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_ai vector unsigned char
+vec_load_len_r(const unsigned char *__ptr, unsigned int __len) {
+  return (vector unsigned char)__builtin_s390_vlrl(__len, __ptr);
+}
+#endif
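+
+/* Illustrative sketch, not part of the upstream header: a z14-only
+ * partial load that, like vec_load_len, reads at most 16 bytes:
+ *
+ *   vector unsigned char head = vec_load_len_r(p, len);
+ */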
+
 /*-- vec_store_len ----------------------------------------------------------*/
 
 static inline __ATTRS_o_ai void
@@ -1062,12 +1322,30 @@
   __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai void
+vec_store_len(vector float __vec, float *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+#endif
+
 static inline __ATTRS_o_ai void
 vec_store_len(vector double __vec, double *__ptr,
               unsigned int __len) {
   __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
 }
 
+/*-- vec_store_len_r --------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_ai void
+vec_store_len_r(vector unsigned char __vec, unsigned char *__ptr,
+                unsigned int __len) {
+  __builtin_s390_vstrl((vector signed char)__vec, __len, __ptr);
+}
+#endif
+
 /*-- vec_load_pair ----------------------------------------------------------*/
 
 static inline __ATTRS_o_ai vector signed long long
@@ -1232,6 +1510,14 @@
   return (vector unsigned long long)__vec[__index];
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_splat(vector float __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector float)__vec[__index];
+}
+#endif
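+
+/* Illustrative sketch, not part of the upstream header: vec_splat
+ * broadcasts one existing lane, while vec_splats below broadcasts a
+ * scalar:
+ *
+ *   vector float all_lane0 = vec_splat(v, 0);
+ */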
+
 static inline __ATTRS_o_ai vector double
 vec_splat(vector double __vec, int __index)
   __constant_range(__index, 0, 1) {
@@ -1332,6 +1618,13 @@
   return (vector unsigned long long)__scalar;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_splats(float __scalar) {
+  return (vector float)__scalar;
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_splats(double __scalar) {
   return (vector double)__scalar;
@@ -1425,6 +1718,13 @@
   return (vector unsigned long long)(__a[0], __b[0]);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_mergeh(vector float __a, vector float __b) {
+  return (vector float)(__a[0], __b[0], __a[1], __b[1]);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_mergeh(vector double __a, vector double __b) {
   return (vector double)(__a[0], __b[0]);
@@ -1501,6 +1801,13 @@
   return (vector unsigned long long)(__a[1], __b[1]);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_mergel(vector float __a, vector float __b) {
+  return (vector float)(__a[2], __b[2], __a[3], __b[3]);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_mergel(vector double __a, vector double __b) {
   return (vector double)(__a[1], __b[1]);
@@ -1866,6 +2173,13 @@
   return (vector bool long long)(__a == __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector float __a, vector float __b) {
+  return (vector bool int)(__a == __b);
+}
+#endif
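+
+/* Illustrative sketch, not part of the upstream header: the float
+ * comparisons return vector bool int masks (one 32-bit lane per
+ * element), directly usable with vec_sel:
+ *
+ *   vector bool int eq = vec_cmpeq(a, b);
+ */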
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmpeq(vector double __a, vector double __b) {
   return (vector bool long long)(__a == __b);
@@ -1913,6 +2227,13 @@
   return (vector bool long long)(__a >= __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector float __a, vector float __b) {
+  return (vector bool int)(__a >= __b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmpge(vector double __a, vector double __b) {
   return (vector bool long long)(__a >= __b);
@@ -1960,6 +2281,13 @@
   return (vector bool long long)(__a > __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector float __a, vector float __b) {
+  return (vector bool int)(__a > __b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmpgt(vector double __a, vector double __b) {
   return (vector bool long long)(__a > __b);
@@ -2007,6 +2335,13 @@
   return (vector bool long long)(__a <= __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector float __a, vector float __b) {
+  return (vector bool int)(__a <= __b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmple(vector double __a, vector double __b) {
   return (vector bool long long)(__a <= __b);
@@ -2054,6 +2389,13 @@
   return (vector bool long long)(__a < __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector float __a, vector float __b) {
+  return (vector bool int)(__a < __b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmplt(vector double __a, vector double __b) {
   return (vector bool long long)(__a < __b);
@@ -2068,6 +2410,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2075,6 +2418,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2090,6 +2434,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2098,6 +2443,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2121,6 +2467,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2128,6 +2475,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2143,6 +2491,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -2151,6 +2500,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -2174,6 +2524,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -2181,6 +2532,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -2196,6 +2548,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -2204,6 +2557,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -2227,6 +2581,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -2234,6 +2589,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -2249,6 +2605,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -2257,6 +2614,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -2273,6 +2631,15 @@
   return __cc == 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_eq(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfcesbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+#endif
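+
+/* Illustrative sketch, not part of the upstream header: the vfcesbs
+ * builtin sets the condition code to 0 when all lanes compare equal,
+ * so vec_all_eq reduces a whole-vector comparison to one branch:
+ *
+ *   if (vec_all_eq(a, b)) { }  // every lane of a equals b
+ */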
+
 static inline __ATTRS_o_ai int
 vec_all_eq(vector double __a, vector double __b) {
   int __cc;
@@ -2289,6 +2656,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2296,6 +2664,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2311,6 +2680,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2319,6 +2689,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2342,6 +2713,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2349,6 +2721,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2364,6 +2737,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -2372,6 +2746,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -2395,6 +2770,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -2402,6 +2778,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -2417,6 +2794,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -2425,6 +2803,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -2448,6 +2827,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -2455,6 +2835,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -2470,6 +2851,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -2478,6 +2860,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -2494,6 +2877,15 @@
   return __cc == 3;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_ne(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfcesbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_ne(vector double __a, vector double __b) {
   int __cc;
@@ -2510,6 +2902,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2517,6 +2910,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2531,6 +2925,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2538,6 +2933,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2545,6 +2941,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -2560,6 +2957,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2567,6 +2965,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2581,6 +2980,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -2588,6 +2988,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -2595,6 +2996,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -2610,6 +3012,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -2617,6 +3020,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -2631,6 +3035,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -2638,6 +3043,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -2645,6 +3051,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -2660,6 +3067,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -2667,6 +3075,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -2681,6 +3090,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -2688,6 +3098,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -2695,6 +3106,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -2703,6 +3115,15 @@
   return __cc == 3;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_ge(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_ge(vector double __a, vector double __b) {
   int __cc;
@@ -2719,6 +3140,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2726,6 +3148,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2740,6 +3163,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2747,6 +3171,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2754,6 +3179,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -2769,6 +3195,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2776,6 +3203,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2790,6 +3218,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -2797,6 +3226,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -2804,6 +3234,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -2819,6 +3250,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -2826,6 +3258,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -2840,6 +3273,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -2847,6 +3281,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -2854,6 +3289,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -2869,6 +3305,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -2876,6 +3313,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -2890,6 +3328,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -2897,6 +3336,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -2904,6 +3344,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -2912,6 +3353,15 @@
   return __cc == 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_gt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_gt(vector double __a, vector double __b) {
   int __cc;
@@ -2928,6 +3378,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2935,6 +3386,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2949,6 +3401,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2956,6 +3409,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2963,6 +3417,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -2978,6 +3433,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2985,6 +3441,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2999,6 +3456,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3006,6 +3464,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3013,6 +3472,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -3028,6 +3488,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3035,6 +3496,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3049,6 +3511,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3056,6 +3519,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3063,6 +3527,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -3078,6 +3543,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3085,6 +3551,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -3099,6 +3566,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -3106,6 +3574,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -3113,6 +3582,7 @@
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -3121,6 +3591,15 @@
   return __cc == 3;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_le(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_le(vector double __a, vector double __b) {
   int __cc;
@@ -3137,6 +3616,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -3144,6 +3624,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -3158,6 +3639,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -3165,6 +3647,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -3172,6 +3655,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -3187,6 +3671,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -3194,6 +3679,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -3208,6 +3694,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3215,6 +3702,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3222,6 +3710,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -3237,6 +3726,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3244,6 +3734,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3258,6 +3749,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3265,6 +3757,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3272,6 +3765,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -3287,6 +3781,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3294,6 +3789,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -3308,6 +3804,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -3315,6 +3812,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -3322,6 +3820,7 @@
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -3330,6 +3829,15 @@
   return __cc == 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_lt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_lt(vector double __a, vector double __b) {
   int __cc;
@@ -3339,7 +3847,16 @@
 
 /*-- vec_all_nge ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_nge(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_nge(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchedbs(__a, __b, &__cc);
@@ -3348,7 +3865,16 @@
 
 /*-- vec_all_ngt ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_ngt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_ngt(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchdbs(__a, __b, &__cc);
@@ -3357,7 +3883,16 @@
 
 /*-- vec_all_nle ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_nle(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_nle(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchedbs(__b, __a, &__cc);
@@ -3366,7 +3901,16 @@
 
 /*-- vec_all_nlt ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_nlt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_nlt(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchdbs(__b, __a, &__cc);
@@ -3375,7 +3919,16 @@
 
 /*-- vec_all_nan ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_nan(vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc == 0;
+}
+#endif
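+
+/* Illustrative sketch, not part of the upstream header: vec_all_nan
+ * and vec_all_numeric use the test-data-class builtins with mask 15,
+ * which selects the NaN classes; cc == 0 means every lane matched and
+ * cc == 3 means none did:
+ *
+ *   if (vec_all_nan(x)) { }  // every lane of x is a NaN
+ */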
+
+static inline __ATTRS_o_ai int
 vec_all_nan(vector double __a) {
   int __cc;
   __builtin_s390_vftcidb(__a, 15, &__cc);
@@ -3384,7 +3937,16 @@
 
 /*-- vec_all_numeric --------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_numeric(vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_numeric(vector double __a) {
   int __cc;
   __builtin_s390_vftcidb(__a, 15, &__cc);
@@ -3400,6 +3962,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -3407,6 +3970,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -3422,6 +3986,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -3430,6 +3995,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -3453,6 +4019,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -3460,6 +4027,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -3475,6 +4043,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3483,6 +4052,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3506,6 +4076,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3513,6 +4084,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3528,6 +4100,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3536,6 +4109,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3559,6 +4133,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3566,6 +4141,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -3581,6 +4157,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -3589,6 +4166,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -3605,6 +4183,15 @@
   return __cc <= 1;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_eq(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfcesbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_eq(vector double __a, vector double __b) {
   int __cc;
@@ -3621,6 +4208,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -3628,6 +4216,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -3643,6 +4232,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -3651,6 +4241,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -3674,6 +4265,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -3681,6 +4273,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -3696,6 +4289,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3704,6 +4298,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3727,6 +4322,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3734,6 +4330,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3749,6 +4346,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3757,6 +4355,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3780,6 +4379,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3787,6 +4387,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -3802,6 +4403,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -3810,6 +4412,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -3826,6 +4429,15 @@
   return __cc != 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_ne(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfcesbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_ne(vector double __a, vector double __b) {
   int __cc;
@@ -3842,6 +4454,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -3849,6 +4462,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -3863,6 +4477,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -3870,6 +4485,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -3877,6 +4493,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -3892,6 +4509,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -3899,6 +4517,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -3913,6 +4532,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3920,6 +4540,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3927,6 +4548,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -3942,6 +4564,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3949,6 +4572,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3963,6 +4587,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3970,6 +4595,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3977,6 +4603,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -3992,6 +4619,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3999,6 +4627,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -4013,6 +4642,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -4020,6 +4650,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -4027,6 +4658,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -4035,6 +4667,15 @@
   return __cc != 0;
 }
 
+#if __ARCH__ >= 12
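+// vfchesbs computes __a >= __b per lane; __cc <= 1 covers the CC0
+// ("all lanes") and CC1 ("some lanes") cases, i.e. at least one lane
+// compared true.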
+static inline __ATTRS_o_ai int
+vec_any_ge(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_ge(vector double __a, vector double __b) {
   int __cc;
@@ -4051,6 +4692,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -4058,6 +4700,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -4072,6 +4715,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -4079,6 +4723,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -4086,6 +4731,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -4101,6 +4747,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -4108,6 +4755,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -4122,6 +4770,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -4129,6 +4778,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -4136,6 +4786,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -4151,6 +4802,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -4158,6 +4810,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -4172,6 +4825,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -4179,6 +4833,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -4186,6 +4841,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -4201,6 +4857,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -4208,6 +4865,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -4222,6 +4880,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -4229,6 +4888,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -4236,6 +4896,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -4244,6 +4905,15 @@
   return __cc <= 1;
 }
 
+#if __ARCH__ >= 12
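+// As for vec_any_ge: vfchsbs computes __a > __b per lane, and
+// __cc <= 1 means at least one lane compared true.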
+static inline __ATTRS_o_ai int
+vec_any_gt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_gt(vector double __a, vector double __b) {
   int __cc;
@@ -4260,6 +4930,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -4267,6 +4938,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -4281,6 +4953,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -4288,6 +4961,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -4295,6 +4969,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -4310,6 +4985,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -4317,6 +4993,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -4331,6 +5008,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -4338,6 +5016,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -4345,6 +5024,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -4360,6 +5040,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -4367,6 +5048,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -4381,6 +5063,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -4388,6 +5071,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -4395,6 +5079,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -4410,6 +5095,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -4417,6 +5103,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -4431,6 +5118,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -4438,6 +5126,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -4445,6 +5134,7 @@
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -4453,6 +5143,15 @@
   return __cc != 0;
 }
 
+#if __ARCH__ >= 12
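+// __a <= __b is evaluated as __b >= __a, hence the swapped operands.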
+static inline __ATTRS_o_ai int
+vec_any_le(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_le(vector double __a, vector double __b) {
   int __cc;
@@ -4469,6 +5168,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -4476,6 +5176,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -4490,6 +5191,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -4497,6 +5199,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -4504,6 +5207,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -4519,6 +5223,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -4526,6 +5231,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -4540,6 +5246,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -4547,6 +5254,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -4554,6 +5262,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -4569,6 +5278,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -4576,6 +5286,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -4590,6 +5301,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -4597,6 +5309,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -4604,6 +5317,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -4619,6 +5333,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -4626,6 +5341,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -4640,6 +5356,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -4647,6 +5364,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -4654,6 +5372,7 @@
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -4662,6 +5381,15 @@
   return __cc <= 1;
 }
 
+#if __ARCH__ >= 12
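+// Likewise swapped: __a < __b is evaluated as __b > __a via vfchsbs.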
+static inline __ATTRS_o_ai int
+vec_any_lt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_lt(vector double __a, vector double __b) {
   int __cc;
@@ -4671,7 +5399,16 @@
 
 /*-- vec_any_nge ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
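+// Note this is not the same as vec_any_lt: "not greater or equal" is
+// also true for unordered (NaN) lanes.  __cc != 0 just means not every
+// lane satisfied __a >= __b; the same applies to vec_any_ngt/nle/nlt
+// below.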
+static inline __ATTRS_o_ai int
+vec_any_nge(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_nge(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchedbs(__a, __b, &__cc);
@@ -4680,7 +5417,16 @@
 
 /*-- vec_any_ngt ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_ngt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_ngt(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchdbs(__a, __b, &__cc);
@@ -4689,7 +5435,16 @@
 
 /*-- vec_any_nle ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_nle(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_nle(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchedbs(__b, __a, &__cc);
@@ -4698,7 +5453,16 @@
 
 /*-- vec_any_nlt ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_nlt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_nlt(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchdbs(__b, __a, &__cc);
@@ -4707,7 +5471,16 @@
 
 /*-- vec_any_nan ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
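+// vftcisb tests each lane against a data-class mask; 15 selects the
+// NaN classes (quiet/signaling, either sign).  CC3 means no lane
+// matched, so any NaN present yields __cc != 3.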
+static inline __ATTRS_o_ai int
+vec_any_nan(vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc != 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_nan(vector double __a) {
   int __cc;
   __builtin_s390_vftcidb(__a, 15, &__cc);
@@ -4716,7 +5489,16 @@
 
 /*-- vec_any_numeric --------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
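+// Same class test: CC0 means every lane is a NaN, so __cc != 0 implies
+// at least one numeric lane.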
+static inline __ATTRS_o_ai int
+vec_any_numeric(vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_numeric(vector double __a) {
   int __cc;
   __builtin_s390_vftcidb(__a, 15, &__cc);
@@ -4735,11 +5517,13 @@
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_andc(vector bool char __a, vector signed char __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_andc(vector signed char __a, vector bool char __b) {
   return __a & ~__b;
@@ -4750,11 +5534,13 @@
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_andc(vector bool char __a, vector unsigned char __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_andc(vector unsigned char __a, vector bool char __b) {
   return __a & ~__b;
@@ -4770,11 +5556,13 @@
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_andc(vector bool short __a, vector signed short __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_andc(vector signed short __a, vector bool short __b) {
   return __a & ~__b;
@@ -4785,11 +5573,13 @@
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_andc(vector bool short __a, vector unsigned short __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_andc(vector unsigned short __a, vector bool short __b) {
   return __a & ~__b;
@@ -4805,11 +5595,13 @@
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_andc(vector bool int __a, vector signed int __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_andc(vector signed int __a, vector bool int __b) {
   return __a & ~__b;
@@ -4820,11 +5612,13 @@
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_andc(vector bool int __a, vector unsigned int __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_andc(vector unsigned int __a, vector bool int __b) {
   return __a & ~__b;
@@ -4840,11 +5634,13 @@
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_andc(vector bool long long __a, vector signed long long __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_andc(vector signed long long __a, vector bool long long __b) {
   return __a & ~__b;
@@ -4855,28 +5651,40 @@
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_andc(vector bool long long __a, vector unsigned long long __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_andc(vector unsigned long long __a, vector bool long long __b) {
   return __a & ~__b;
 }
 
+#if __ARCH__ >= 12
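+// C provides no bitwise operators for float vectors, so the lanes are
+// punned to unsigned int, combined, and cast back.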
+static inline __ATTRS_o_ai vector float
+vec_andc(vector float __a, vector float __b) {
+  return (vector float)((vector unsigned int)__a &
+                         ~(vector unsigned int)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_andc(vector double __a, vector double __b) {
   return (vector double)((vector unsigned long long)__a &
                          ~(vector unsigned long long)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_andc(vector bool long long __a, vector double __b) {
   return (vector double)((vector unsigned long long)__a &
                          ~(vector unsigned long long)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_andc(vector double __a, vector bool long long __b) {
   return (vector double)((vector unsigned long long)__a &
@@ -4895,11 +5703,13 @@
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_nor(vector bool char __a, vector signed char __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_nor(vector signed char __a, vector bool char __b) {
   return ~(__a | __b);
@@ -4910,11 +5720,13 @@
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_nor(vector bool char __a, vector unsigned char __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_nor(vector unsigned char __a, vector bool char __b) {
   return ~(__a | __b);
@@ -4930,11 +5742,13 @@
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_nor(vector bool short __a, vector signed short __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_nor(vector signed short __a, vector bool short __b) {
   return ~(__a | __b);
@@ -4945,11 +5759,13 @@
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_nor(vector bool short __a, vector unsigned short __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_nor(vector unsigned short __a, vector bool short __b) {
   return ~(__a | __b);
@@ -4965,11 +5781,13 @@
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_nor(vector bool int __a, vector signed int __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_nor(vector signed int __a, vector bool int __b) {
   return ~(__a | __b);
@@ -4980,11 +5798,13 @@
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_nor(vector bool int __a, vector unsigned int __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_nor(vector unsigned int __a, vector bool int __b) {
   return ~(__a | __b);
@@ -5000,11 +5820,13 @@
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_nor(vector bool long long __a, vector signed long long __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_nor(vector signed long long __a, vector bool long long __b) {
   return ~(__a | __b);
@@ -5015,34 +5837,274 @@
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_nor(vector bool long long __a, vector unsigned long long __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_nor(vector unsigned long long __a, vector bool long long __b) {
   return ~(__a | __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nor(vector float __a, vector float __b) {
+  return (vector float)~((vector unsigned int)__a |
+                         (vector unsigned int)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_nor(vector double __a, vector double __b) {
   return (vector double)~((vector unsigned long long)__a |
                           (vector unsigned long long)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_nor(vector bool long long __a, vector double __b) {
   return (vector double)~((vector unsigned long long)__a |
                           (vector unsigned long long)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_nor(vector double __a, vector bool long long __b) {
   return (vector double)~((vector unsigned long long)__a |
                           (vector unsigned long long)__b);
 }
 
+/*-- vec_orc ----------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
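+// OR with complement (__a | ~__b), available from arch level 12 on;
+// the float/double overloads pun the lanes to unsigned integers.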
+static inline __ATTRS_o_ai vector bool char
+vec_orc(vector bool char __a, vector bool char __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_orc(vector signed char __a, vector signed char __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_orc(vector unsigned char __a, vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_orc(vector bool short __a, vector bool short __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_orc(vector signed short __a, vector signed short __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_orc(vector unsigned short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_orc(vector bool int __a, vector bool int __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_orc(vector signed int __a, vector signed int __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_orc(vector unsigned int __a, vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_orc(vector bool long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_orc(vector signed long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_orc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector float
+vec_orc(vector float __a, vector float __b) {
+  return (vector float)((vector unsigned int)__a |
+                        ~(vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_orc(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a |
+                         ~(vector unsigned long long)__b);
+}
+#endif
+
+/*-- vec_nand ---------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
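+// NOT AND: ~(__a & __b), also new with arch level 12.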
+static inline __ATTRS_o_ai vector bool char
+vec_nand(vector bool char __a, vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nand(vector signed char __a, vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nand(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_nand(vector bool short __a, vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nand(vector signed short __a, vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nand(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_nand(vector bool int __a, vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nand(vector signed int __a, vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nand(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_nand(vector bool long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nand(vector signed long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nand(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_nand(vector float __a, vector float __b) {
+  return (vector float)~((vector unsigned int)__a &
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nand(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a &
+                          (vector unsigned long long)__b);
+}
+#endif
+
+/*-- vec_eqv ----------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
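+// Equivalence (XNOR): ~(__a ^ __b).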
+static inline __ATTRS_o_ai vector bool char
+vec_eqv(vector bool char __a, vector bool char __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_eqv(vector signed char __a, vector signed char __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_eqv(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_eqv(vector bool short __a, vector bool short __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_eqv(vector signed short __a, vector signed short __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_eqv(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_eqv(vector bool int __a, vector bool int __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_eqv(vector signed int __a, vector signed int __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_eqv(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_eqv(vector bool long long __a, vector bool long long __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_eqv(vector signed long long __a, vector signed long long __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_eqv(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_eqv(vector float __a, vector float __b) {
+  return (vector float)~((vector unsigned int)__a ^
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_eqv(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a ^
+                          (vector unsigned long long)__b);
+}
+#endif
+
 /*-- vec_cntlz --------------------------------------------------------------*/
 
 static inline __ATTRS_o_ai vector unsigned char
@@ -5323,30 +6385,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_sll(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_sll(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sll(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_s390_vsl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sll(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sll(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_s390_vsl(
@@ -5358,11 +6425,13 @@
   return __builtin_s390_vsl(__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_sll(vector unsigned char __a, vector unsigned short __b) {
   return __builtin_s390_vsl(__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_sll(vector unsigned char __a, vector unsigned int __b) {
   return __builtin_s390_vsl(__a, (vector unsigned char)__b);
@@ -5374,30 +6443,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_sll(vector signed short __a, vector unsigned short __b) {
   return (vector signed short)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_sll(vector signed short __a, vector unsigned int __b) {
   return (vector signed short)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sll(vector bool short __a, vector unsigned char __b) {
   return (vector bool short)__builtin_s390_vsl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sll(vector bool short __a, vector unsigned short __b) {
   return (vector bool short)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sll(vector bool short __a, vector unsigned int __b) {
   return (vector bool short)__builtin_s390_vsl(
@@ -5410,12 +6484,14 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_sll(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_sll(vector unsigned short __a, vector unsigned int __b) {
   return (vector unsigned short)__builtin_s390_vsl(
@@ -5428,30 +6504,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_sll(vector signed int __a, vector unsigned short __b) {
   return (vector signed int)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_sll(vector signed int __a, vector unsigned int __b) {
   return (vector signed int)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sll(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_s390_vsl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sll(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sll(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_s390_vsl(
@@ -5464,12 +6545,14 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_sll(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_sll(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_s390_vsl(
@@ -5482,30 +6565,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_sll(vector signed long long __a, vector unsigned short __b) {
   return (vector signed long long)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_sll(vector signed long long __a, vector unsigned int __b) {
   return (vector signed long long)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sll(vector bool long long __a, vector unsigned char __b) {
   return (vector bool long long)__builtin_s390_vsl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sll(vector bool long long __a, vector unsigned short __b) {
   return (vector bool long long)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sll(vector bool long long __a, vector unsigned int __b) {
   return (vector bool long long)__builtin_s390_vsl(
@@ -5518,12 +6606,14 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_sll(vector unsigned long long __a, vector unsigned short __b) {
   return (vector unsigned long long)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_sll(vector unsigned long long __a, vector unsigned int __b) {
   return (vector unsigned long long)__builtin_s390_vsl(
@@ -5626,6 +6716,20 @@
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+#if __ARCH__ >= 12
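+// The new float overloads pun both operands to byte vectors for vslb,
+// matching the integer overloads above.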
+static inline __ATTRS_o_ai vector float
+vec_slb(vector float __a, vector signed int __b) {
+  return (vector float)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_slb(vector float __a, vector unsigned int __b) {
+  return (vector float)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_slb(vector double __a, vector signed long long __b) {
   return (vector double)__builtin_s390_vslb(
@@ -5644,6 +6748,10 @@
 vec_sld(vector signed char __a, vector signed char __b, int __c)
   __constant_range(__c, 0, 15);
 
+extern __ATTRS_o vector bool char
+vec_sld(vector bool char __a, vector bool char __b, int __c)
+  __constant_range(__c, 0, 15);
+
 extern __ATTRS_o vector unsigned char
 vec_sld(vector unsigned char __a, vector unsigned char __b, int __c)
   __constant_range(__c, 0, 15);
@@ -5652,6 +6760,10 @@
 vec_sld(vector signed short __a, vector signed short __b, int __c)
   __constant_range(__c, 0, 15);
 
+extern __ATTRS_o vector bool short
+vec_sld(vector bool short __a, vector bool short __b, int __c)
+  __constant_range(__c, 0, 15);
+
 extern __ATTRS_o vector unsigned short
 vec_sld(vector unsigned short __a, vector unsigned short __b, int __c)
   __constant_range(__c, 0, 15);
@@ -5660,6 +6772,10 @@
 vec_sld(vector signed int __a, vector signed int __b, int __c)
   __constant_range(__c, 0, 15);
 
+extern __ATTRS_o vector bool int
+vec_sld(vector bool int __a, vector bool int __b, int __c)
+  __constant_range(__c, 0, 15);
+
 extern __ATTRS_o vector unsigned int
 vec_sld(vector unsigned int __a, vector unsigned int __b, int __c)
   __constant_range(__c, 0, 15);
@@ -5668,10 +6784,20 @@
 vec_sld(vector signed long long __a, vector signed long long __b, int __c)
   __constant_range(__c, 0, 15);
 
+extern __ATTRS_o vector bool long long
+vec_sld(vector bool long long __a, vector bool long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
 extern __ATTRS_o vector unsigned long long
 vec_sld(vector unsigned long long __a, vector unsigned long long __b, int __c)
   __constant_range(__c, 0, 15);
 
+#if __ARCH__ >= 12
+extern __ATTRS_o vector float
+vec_sld(vector float __a, vector float __b, int __c)
+  __constant_range(__c, 0, 15);
+#endif
+
 extern __ATTRS_o vector double
 vec_sld(vector double __a, vector double __b, int __c)
   __constant_range(__c, 0, 15);
@@ -5714,6 +6840,7 @@
 vec_sldw(vector unsigned long long __a, vector unsigned long long __b, int __c)
   __constant_range(__c, 0, 3);
 
+// This prototype is deprecated.
 extern __ATTRS_o vector double
 vec_sldw(vector double __a, vector double __b, int __c)
   __constant_range(__c, 0, 3);
@@ -5730,30 +6857,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_sral(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_sral(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sral(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_s390_vsra(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sral(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sral(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_s390_vsra(
@@ -5765,11 +6897,13 @@
   return __builtin_s390_vsra(__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_sral(vector unsigned char __a, vector unsigned short __b) {
   return __builtin_s390_vsra(__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_sral(vector unsigned char __a, vector unsigned int __b) {
   return __builtin_s390_vsra(__a, (vector unsigned char)__b);
@@ -5781,30 +6915,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_sral(vector signed short __a, vector unsigned short __b) {
   return (vector signed short)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_sral(vector signed short __a, vector unsigned int __b) {
   return (vector signed short)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sral(vector bool short __a, vector unsigned char __b) {
   return (vector bool short)__builtin_s390_vsra(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sral(vector bool short __a, vector unsigned short __b) {
   return (vector bool short)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sral(vector bool short __a, vector unsigned int __b) {
   return (vector bool short)__builtin_s390_vsra(
@@ -5817,12 +6956,14 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_sral(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_sral(vector unsigned short __a, vector unsigned int __b) {
   return (vector unsigned short)__builtin_s390_vsra(
@@ -5835,30 +6976,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_sral(vector signed int __a, vector unsigned short __b) {
   return (vector signed int)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_sral(vector signed int __a, vector unsigned int __b) {
   return (vector signed int)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sral(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_s390_vsra(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sral(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sral(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_s390_vsra(
@@ -5871,12 +7017,14 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_sral(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_sral(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_s390_vsra(
@@ -5889,30 +7037,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_sral(vector signed long long __a, vector unsigned short __b) {
   return (vector signed long long)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_sral(vector signed long long __a, vector unsigned int __b) {
   return (vector signed long long)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sral(vector bool long long __a, vector unsigned char __b) {
   return (vector bool long long)__builtin_s390_vsra(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sral(vector bool long long __a, vector unsigned short __b) {
   return (vector bool long long)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sral(vector bool long long __a, vector unsigned int __b) {
   return (vector bool long long)__builtin_s390_vsra(
@@ -5925,12 +7078,14 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_sral(vector unsigned long long __a, vector unsigned short __b) {
   return (vector unsigned long long)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_sral(vector unsigned long long __a, vector unsigned int __b) {
   return (vector unsigned long long)__builtin_s390_vsra(
@@ -6033,6 +7188,20 @@
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_srab(vector float __a, vector signed int __b) {
+  return (vector float)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_srab(vector float __a, vector unsigned int __b) {
+  return (vector float)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_srab(vector double __a, vector signed long long __b) {
   return (vector double)__builtin_s390_vsrab(
@@ -6053,30 +7222,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_srl(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_srl(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_srl(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_s390_vsrl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_srl(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_srl(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_s390_vsrl(
@@ -6088,11 +7262,13 @@
   return __builtin_s390_vsrl(__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_srl(vector unsigned char __a, vector unsigned short __b) {
   return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_srl(vector unsigned char __a, vector unsigned int __b) {
   return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
@@ -6104,30 +7280,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_srl(vector signed short __a, vector unsigned short __b) {
   return (vector signed short)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_srl(vector signed short __a, vector unsigned int __b) {
   return (vector signed short)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_srl(vector bool short __a, vector unsigned char __b) {
   return (vector bool short)__builtin_s390_vsrl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_srl(vector bool short __a, vector unsigned short __b) {
   return (vector bool short)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_srl(vector bool short __a, vector unsigned int __b) {
   return (vector bool short)__builtin_s390_vsrl(
@@ -6140,12 +7321,14 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_srl(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_srl(vector unsigned short __a, vector unsigned int __b) {
   return (vector unsigned short)__builtin_s390_vsrl(
@@ -6158,30 +7341,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_srl(vector signed int __a, vector unsigned short __b) {
   return (vector signed int)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_srl(vector signed int __a, vector unsigned int __b) {
   return (vector signed int)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_srl(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_s390_vsrl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_srl(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_srl(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_s390_vsrl(
@@ -6194,12 +7382,14 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_srl(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_srl(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_s390_vsrl(
@@ -6212,30 +7402,35 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_srl(vector signed long long __a, vector unsigned short __b) {
   return (vector signed long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_srl(vector signed long long __a, vector unsigned int __b) {
   return (vector signed long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_srl(vector bool long long __a, vector unsigned char __b) {
   return (vector bool long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_srl(vector bool long long __a, vector unsigned short __b) {
   return (vector bool long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_srl(vector bool long long __a, vector unsigned int __b) {
   return (vector bool long long)__builtin_s390_vsrl(
@@ -6248,12 +7443,14 @@
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_srl(vector unsigned long long __a, vector unsigned short __b) {
   return (vector unsigned long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_srl(vector unsigned long long __a, vector unsigned int __b) {
   return (vector unsigned long long)__builtin_s390_vsrl(
@@ -6356,6 +7553,20 @@
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_srb(vector float __a, vector signed int __b) {
+  return (vector float)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_srb(vector float __a, vector unsigned int __b) {
+  return (vector float)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+#endif
+
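// A minimal usage sketch (illustrative, not part of the header; assumes
// -march=z14 -mzvector and <vecintrin.h>): the new overloads accept float
// vectors directly, with the shift-amount operand encoded exactly as for
// the existing integer overloads. A zero operand makes the result easy to
// check.
//
//   vector float v = {1.0f, 2.0f, 3.0f, 4.0f};
//   vector unsigned int sh = {0, 0, 0, 0};
//   vector float r = vec_srb(v, sh);   // zero shift: r == v
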
 static inline __ATTRS_o_ai vector double
 vec_srb(vector double __a, vector signed long long __b) {
   return (vector double)__builtin_s390_vsrlb(
@@ -6390,6 +7601,13 @@
   return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed long long)0));
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_abs(vector float __a) {
+  return __builtin_s390_vflpsb(__a);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_abs(vector double __a) {
   return __builtin_s390_vflpdb(__a);
@@ -6397,7 +7615,14 @@
 
 /*-- vec_nabs ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nabs(vector float __a) {
+  return __builtin_s390_vflnsb(__a);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_nabs(vector double __a) {
   return __builtin_s390_vflndb(__a);
 }
@@ -6409,12 +7634,14 @@
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_max(vector signed char __a, vector bool char __b) {
   vector signed char __bc = (vector signed char)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_max(vector bool char __a, vector signed char __b) {
   vector signed char __ac = (vector signed char)__a;
@@ -6426,12 +7653,14 @@
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_max(vector unsigned char __a, vector bool char __b) {
   vector unsigned char __bc = (vector unsigned char)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_max(vector bool char __a, vector unsigned char __b) {
   vector unsigned char __ac = (vector unsigned char)__a;
@@ -6443,12 +7672,14 @@
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_max(vector signed short __a, vector bool short __b) {
   vector signed short __bc = (vector signed short)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_max(vector bool short __a, vector signed short __b) {
   vector signed short __ac = (vector signed short)__a;
@@ -6460,12 +7691,14 @@
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_max(vector unsigned short __a, vector bool short __b) {
   vector unsigned short __bc = (vector unsigned short)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_max(vector bool short __a, vector unsigned short __b) {
   vector unsigned short __ac = (vector unsigned short)__a;
@@ -6477,12 +7710,14 @@
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_max(vector signed int __a, vector bool int __b) {
   vector signed int __bc = (vector signed int)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_max(vector bool int __a, vector signed int __b) {
   vector signed int __ac = (vector signed int)__a;
@@ -6494,12 +7729,14 @@
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_max(vector unsigned int __a, vector bool int __b) {
   vector unsigned int __bc = (vector unsigned int)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_max(vector bool int __a, vector unsigned int __b) {
   vector unsigned int __ac = (vector unsigned int)__a;
@@ -6511,12 +7748,14 @@
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_max(vector signed long long __a, vector bool long long __b) {
   vector signed long long __bc = (vector signed long long)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_max(vector bool long long __a, vector signed long long __b) {
   vector signed long long __ac = (vector signed long long)__a;
@@ -6528,21 +7767,34 @@
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_max(vector unsigned long long __a, vector bool long long __b) {
   vector unsigned long long __bc = (vector unsigned long long)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_max(vector bool long long __a, vector unsigned long long __b) {
   vector unsigned long long __ac = (vector unsigned long long)__a;
   return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_max(vector float __a, vector float __b) {
+  return __builtin_s390_vfmaxsb(__a, __b, 0);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_max(vector double __a, vector double __b) {
+#if __ARCH__ >= 12
+  return __builtin_s390_vfmaxdb(__a, __b, 0);
+#else
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+#endif
 }
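
// A minimal usage sketch (illustrative values): on z14 (__ARCH__ >= 12)
// this lowers to VFMAXDB/VFMAXSB, while older targets take the
// compare-and-select path above, so NaN handling may differ between the
// two.
//
//   vector double a = {1.5, -2.0};
//   vector double b = {0.5,  3.0};
//   vector double m = vec_max(a, b);   // {1.5, 3.0}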
 
 /*-- vec_min ----------------------------------------------------------------*/
@@ -6552,12 +7804,14 @@
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_min(vector signed char __a, vector bool char __b) {
   vector signed char __bc = (vector signed char)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_min(vector bool char __a, vector signed char __b) {
   vector signed char __ac = (vector signed char)__a;
@@ -6569,12 +7823,14 @@
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_min(vector unsigned char __a, vector bool char __b) {
   vector unsigned char __bc = (vector unsigned char)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_min(vector bool char __a, vector unsigned char __b) {
   vector unsigned char __ac = (vector unsigned char)__a;
@@ -6586,12 +7842,14 @@
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_min(vector signed short __a, vector bool short __b) {
   vector signed short __bc = (vector signed short)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_min(vector bool short __a, vector signed short __b) {
   vector signed short __ac = (vector signed short)__a;
@@ -6603,12 +7861,14 @@
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_min(vector unsigned short __a, vector bool short __b) {
   vector unsigned short __bc = (vector unsigned short)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_min(vector bool short __a, vector unsigned short __b) {
   vector unsigned short __ac = (vector unsigned short)__a;
@@ -6620,12 +7880,14 @@
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_min(vector signed int __a, vector bool int __b) {
   vector signed int __bc = (vector signed int)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_min(vector bool int __a, vector signed int __b) {
   vector signed int __ac = (vector signed int)__a;
@@ -6637,12 +7899,14 @@
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_min(vector unsigned int __a, vector bool int __b) {
   vector unsigned int __bc = (vector unsigned int)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_min(vector bool int __a, vector unsigned int __b) {
   vector unsigned int __ac = (vector unsigned int)__a;
@@ -6654,12 +7918,14 @@
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_min(vector signed long long __a, vector bool long long __b) {
   vector signed long long __bc = (vector signed long long)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_min(vector bool long long __a, vector signed long long __b) {
   vector signed long long __ac = (vector signed long long)__a;
@@ -6671,21 +7937,34 @@
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_min(vector unsigned long long __a, vector bool long long __b) {
   vector unsigned long long __bc = (vector unsigned long long)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_min(vector bool long long __a, vector unsigned long long __b) {
   vector unsigned long long __ac = (vector unsigned long long)__a;
   return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_min(vector float __a, vector float __b) {
+  return __builtin_s390_vfminsb(__a, __b, 0);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_min(vector double __a, vector double __b) {
+#if __ARCH__ >= 12
+  return __builtin_s390_vfmindb(__a, __b, 0);
+#else
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+#endif
 }
 
 /*-- vec_add_u128 -----------------------------------------------------------*/
@@ -7126,6 +8405,13 @@
   return __builtin_s390_vmlof(__a, __b);
 }
 
+/*-- vec_msum_u128 ----------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+#define vec_msum_u128(X, Y, Z, W) \
+  ((vector unsigned char)__builtin_s390_vmslg((X), (Y), (Z), (W)))
+#endif
+
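// A minimal usage sketch (illustrative; per the VMSL definition, the
// result is the 128-bit value X[0]*Y[0] + X[1]*Y[1] + Z, with W = 0
// selecting the plain multiply-sum):
//
//   vector unsigned long long x = {10, 20};
//   vector unsigned long long y = { 3,  4};
//   vector unsigned char acc = {0};
//   vector unsigned char sum = vec_msum_u128(x, y, acc, 0);  // 110 as u128
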
 /*-- vec_sub_u128 -----------------------------------------------------------*/
 
 static inline __ATTRS_ai vector unsigned char
@@ -7263,6 +8549,14 @@
                             (vector unsigned char)__b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_test_mask(vector float __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_test_mask(vector double __a, vector unsigned long long __b) {
   return __builtin_s390_vtm((vector unsigned char)__a,
@@ -7271,27 +8565,77 @@
 
 /*-- vec_madd ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_madd(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfmasb(__a, __b, __c);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_madd(vector double __a, vector double __b, vector double __c) {
   return __builtin_s390_vfmadb(__a, __b, __c);
 }
 
 /*-- vec_msub ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_msub(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfmssb(__a, __b, __c);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_msub(vector double __a, vector double __b, vector double __c) {
   return __builtin_s390_vfmsdb(__a, __b, __c);
 }
 
+/*-- vec_nmadd --------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nmadd(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfnmasb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nmadd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfnmadb(__a, __b, __c);
+}
+#endif
+
+/*-- vec_nmsub --------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nmsub(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfnmssb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nmsub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfnmsdb(__a, __b, __c);
+}
+#endif
+
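// A minimal usage sketch: per the builtin names (vfnma*/vfnms*), these are
// the negated fused forms, vec_nmadd(a, b, c) == -(a * b + c) and
// vec_nmsub(a, b, c) == -(a * b - c), each with a single rounding.
//
//   vector double a = {2.0, 3.0}, b = {4.0, 5.0}, c = {1.0, 1.0};
//   vector double r = vec_nmadd(a, b, c);   // {-9.0, -16.0}
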
 /*-- vec_sqrt ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_sqrt(vector float __a) {
+  return __builtin_s390_vfsqsb(__a);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_sqrt(vector double __a) {
   return __builtin_s390_vfsqdb(__a);
 }
 
 /*-- vec_ld2f ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_ai vector double
 vec_ld2f(const float *__ptr) {
   typedef float __v2f32 __attribute__((__vector_size__(8)));
@@ -7300,6 +8644,7 @@
 
 /*-- vec_st2f ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_ai void
 vec_st2f(vector double __a, float *__ptr) {
   typedef float __v2f32 __attribute__((__vector_size__(8)));
@@ -7308,6 +8653,7 @@
 
 /*-- vec_ctd ----------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_ctd(vector signed long long __a, int __b)
   __constant_range(__b, 0, 31) {
@@ -7316,6 +8662,7 @@
   return __conv;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_ctd(vector unsigned long long __a, int __b)
   __constant_range(__b, 0, 31) {
@@ -7326,6 +8673,7 @@
 
 /*-- vec_ctsl ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_ctsl(vector double __a, int __b)
   __constant_range(__b, 0, 31) {
@@ -7335,6 +8683,7 @@
 
 /*-- vec_ctul ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_ctul(vector double __a, int __b)
   __constant_range(__b, 0, 31) {
@@ -7342,16 +8691,79 @@
   return __builtin_convertvector(__a, vector unsigned long long);
 }
 
+/*-- vec_doublee ------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_ai vector double
+vec_doublee(vector float __a) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  __v2f32 __pack = __builtin_shufflevector(__a, __a, 0, 2);
+  return __builtin_convertvector(__pack, vector double);
+}
+#endif
+
+/*-- vec_floate -------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_ai vector float
+vec_floate(vector double __a) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  __v2f32 __pack = __builtin_convertvector(__a, __v2f32);
+  return __builtin_shufflevector(__pack, __pack, 0, -1, 1, -1);
+}
+#endif
+
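// A minimal round-trip sketch (illustrative): vec_doublee widens the
// even-indexed floats (lanes 0 and 2) to doubles; vec_floate narrows them
// back into the even float lanes, with the odd lanes undefined (hence the
// -1 shuffle indices above).
//
//   vector float  f = {1.0f, 2.0f, 3.0f, 4.0f};
//   vector double d = vec_doublee(f);   // {1.0, 3.0}
//   vector float  e = vec_floate(d);    // {1.0f, undef, 3.0f, undef}
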
+/*-- vec_double -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector double
+vec_double(vector signed long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_double(vector unsigned long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
+/*-- vec_signed -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_signed(vector double __a) {
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+/*-- vec_unsigned -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unsigned(vector double __a) {
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+
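// A minimal usage sketch: these are element-wise conversions through
// __builtin_convertvector, following C cast semantics (double-to-integer
// truncates toward zero).
//
//   vector signed long long i = {-7, 42};
//   vector double d = vec_double(i);            // {-7.0, 42.0}
//   vector signed long long t = vec_signed(d);  // {-7, 42}
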
 /*-- vec_roundp -------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_roundp(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 6);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_roundp(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 6);
 }
 
 /*-- vec_ceil ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_ceil(vector float __a) {
+  // On this platform, vec_ceil never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfisb(__a, 4, 6);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_ceil(vector double __a) {
   // On this platform, vec_ceil never triggers the IEEE-inexact exception.
   return __builtin_s390_vfidb(__a, 4, 6);
@@ -7359,14 +8771,29 @@
 
 /*-- vec_roundm -------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_roundm(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 7);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_roundm(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 7);
 }
 
 /*-- vec_floor --------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_floor(vector float __a) {
+  // On this platform, vec_floor never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfisb(__a, 4, 7);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_floor(vector double __a) {
   // On this platform, vec_floor never triggers the IEEE-inexact exception.
   return __builtin_s390_vfidb(__a, 4, 7);
@@ -7374,14 +8801,29 @@
 
 /*-- vec_roundz -------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_roundz(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 5);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_roundz(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 5);
 }
 
 /*-- vec_trunc --------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_trunc(vector float __a) {
+  // On this platform, vec_trunc never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfisb(__a, 4, 5);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_trunc(vector double __a) {
   // On this platform, vec_trunc never triggers the IEEE-inexact exception.
   return __builtin_s390_vfidb(__a, 4, 5);
@@ -7389,22 +8831,104 @@
 
 /*-- vec_roundc -------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_roundc(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 0);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_roundc(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 0);
 }
 
+/*-- vec_rint ---------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_rint(vector float __a) {
+  // vec_rint may trigger the IEEE-inexact exception.
+  return __builtin_s390_vfisb(__a, 0, 0);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
+vec_rint(vector double __a) {
+  // vec_rint may trigger the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 0, 0);
+}
+
 /*-- vec_round --------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_round(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 4);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_round(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 4);
 }
 
 /*-- vec_fp_test_data_class -------------------------------------------------*/
 
+#if __ARCH__ >= 12
+extern __ATTRS_o vector bool int
+vec_fp_test_data_class(vector float __a, int __b, int *__c)
+  __constant_range(__b, 0, 4095);
+
+extern __ATTRS_o vector bool long long
+vec_fp_test_data_class(vector double __a, int __b, int *__c)
+  __constant_range(__b, 0, 4095);
+
+#define vec_fp_test_data_class(X, Y, Z) \
+  ((__typeof__((vec_fp_test_data_class)((X), (Y), (Z)))) \
+   __extension__ ({ \
+     vector unsigned char __res; \
+     vector unsigned char __x = (vector unsigned char)(X); \
+     int *__z = (Z); \
+     switch (sizeof ((X)[0])) { \
+     case 4:  __res = (vector unsigned char) \
+                      __builtin_s390_vftcisb((vector float)__x, (Y), __z); \
+              break; \
+     default: __res = (vector unsigned char) \
+                      __builtin_s390_vftcidb((vector double)__x, (Y), __z); \
+              break; \
+     } __res; }))
+#else
 #define vec_fp_test_data_class(X, Y, Z) \
   ((vector bool long long)__builtin_s390_vftcidb((X), (Y), (Z)))
+#endif
+
+#define __VEC_CLASS_FP_ZERO_P (1 << 11)
+#define __VEC_CLASS_FP_ZERO_N (1 << 10)
+#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P | __VEC_CLASS_FP_ZERO_N)
+#define __VEC_CLASS_FP_NORMAL_P (1 << 9)
+#define __VEC_CLASS_FP_NORMAL_N (1 << 8)
+#define __VEC_CLASS_FP_NORMAL (__VEC_CLASS_FP_NORMAL_P | \
+                               __VEC_CLASS_FP_NORMAL_N)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 7)
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 6)
+#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \
+                                  __VEC_CLASS_FP_SUBNORMAL_N)
+#define __VEC_CLASS_FP_INFINITY_P (1 << 5)
+#define __VEC_CLASS_FP_INFINITY_N (1 << 4)
+#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P | \
+                                 __VEC_CLASS_FP_INFINITY_N)
+#define __VEC_CLASS_FP_QNAN_P (1 << 3)
+#define __VEC_CLASS_FP_QNAN_N (1 << 2)
+#define __VEC_CLASS_FP_QNAN (__VEC_CLASS_FP_QNAN_P | __VEC_CLASS_FP_QNAN_N)
+#define __VEC_CLASS_FP_SNAN_P (1 << 1)
+#define __VEC_CLASS_FP_SNAN_N (1 << 0)
+#define __VEC_CLASS_FP_SNAN (__VEC_CLASS_FP_SNAN_P | __VEC_CLASS_FP_SNAN_N)
+#define __VEC_CLASS_FP_NAN (__VEC_CLASS_FP_QNAN | __VEC_CLASS_FP_SNAN)
+#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN | \
+                                   __VEC_CLASS_FP_SUBNORMAL | \
+                                   __VEC_CLASS_FP_ZERO | \
+                                   __VEC_CLASS_FP_INFINITY)
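
// A minimal usage sketch (illustrative names): the macro dispatches on the
// element size, so one spelling serves float and double vectors; the third
// argument points at an int that receives the instruction's condition code.
//
//   int cc;
//   vector double v = {0.0, __builtin_inf()};
//   vector bool long long cls =
//       vec_fp_test_data_class(v, __VEC_CLASS_FP_INFINITY, &cc);
//   // cls is all ones in each lane holding +/-infinity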
 
 /*-- vec_cp_until_zero ------------------------------------------------------*/
 
diff --git a/darwin-x86/clang-headers/vpclmulqdqintrin.h b/darwin-x86/clang-headers/vpclmulqdqintrin.h
new file mode 100644
index 0000000..86174a4
--- /dev/null
+++ b/darwin-x86/clang-headers/vpclmulqdqintrin.h
@@ -0,0 +1,42 @@
+/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VPCLMULQDQINTRIN_H
+#define __VPCLMULQDQINTRIN_H
+
+#define _mm256_clmulepi64_epi128(A, B, I) \
+  (__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A),  \
+                                       (__v4di)(__m256i)(B),  \
+                                       (char)(I))
+
+#define _mm512_clmulepi64_epi128(A, B, I) \
+  (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A),  \
+                                       (__v8di)(__m512i)(B),  \
+                                       (char)(I))
+
+#endif /* __VPCLMULQDQINTRIN_H */
+
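// A minimal usage sketch (assumes -mavx -mvpclmulqdq and <immintrin.h>;
// values are illustrative): I selects which 64-bit half of each 128-bit
// lane enters the carry-less multiply, as for _mm_clmulepi64_si128.
//
//   __m256i a = _mm256_set1_epi64x(0x1234);
//   __m256i b = _mm256_set1_epi64x(0x00FF);
//   __m256i r = _mm256_clmulepi64_epi128(a, b, 0x00);  // low halves
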
diff --git a/darwin-x86/clang-headers/waitpkgintrin.h b/darwin-x86/clang-headers/waitpkgintrin.h
new file mode 100644
index 0000000..e29d6cf
--- /dev/null
+++ b/darwin-x86/clang-headers/waitpkgintrin.h
@@ -0,0 +1,56 @@
+/*===----------------------- waitpkgintrin.h - WAITPKG --------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <waitpkgintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __WAITPKGINTRIN_H
+#define __WAITPKGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("waitpkg")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_umonitor (void * __address)
+{
+  __builtin_ia32_umonitor (__address);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_umwait (unsigned int __control, unsigned long long __counter)
+{
+  return __builtin_ia32_umwait (__control,
+    (unsigned int)(__counter >> 32), (unsigned int)__counter);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_tpause (unsigned int __control, unsigned long long __counter)
+{
+  return __builtin_ia32_tpause (__control,
+    (unsigned int)(__counter >> 32), (unsigned int)__counter);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __WAITPKGINTRIN_H */
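
// A minimal usage sketch (assumes -mwaitpkg; __rdtsc() comes from
// <immintrin.h>, and the deadline offset is illustrative): the wrappers
// split the 64-bit TSC deadline into the EDX:EAX pair the instructions
// expect.
//
//   unsigned long long deadline = __rdtsc() + 100000;
//   unsigned char flag = _tpause(0, deadline);  // 0 requests the deeper
//                                               // C0.2 optimized state
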
diff --git a/darwin-x86/clang-headers/wbnoinvdintrin.h b/darwin-x86/clang-headers/wbnoinvdintrin.h
new file mode 100644
index 0000000..cad8336
--- /dev/null
+++ b/darwin-x86/clang-headers/wbnoinvdintrin.h
@@ -0,0 +1,38 @@
+/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic ------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <wbnoinvdintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __WBNOINVDINTRIN_H
+#define __WBNOINVDINTRIN_H
+
+static __inline__ void
+  __attribute__((__always_inline__, __nodebug__,  __target__("wbnoinvd")))
+_wbnoinvd (void)
+{
+  __builtin_ia32_wbnoinvd ();
+}
+
+#endif /* __WBNOINVDINTRIN_H */
diff --git a/darwin-x86/clang-headers/wmmintrin.h b/darwin-x86/clang-headers/wmmintrin.h
index a2d9310..569a8d8 100644
--- a/darwin-x86/clang-headers/wmmintrin.h
+++ b/darwin-x86/clang-headers/wmmintrin.h
@@ -21,8 +21,8 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef _WMMINTRIN_H
-#define _WMMINTRIN_H
+#ifndef __WMMINTRIN_H
+#define __WMMINTRIN_H
 
 #include <emmintrin.h>
 
@@ -30,4 +30,4 @@
 
 #include <__wmmintrin_pclmul.h>
 
-#endif /* _WMMINTRIN_H */
+#endif /* __WMMINTRIN_H */
diff --git a/darwin-x86/clang-headers/x86intrin.h b/darwin-x86/clang-headers/x86intrin.h
index 81a404f..728c58c 100644
--- a/darwin-x86/clang-headers/x86intrin.h
+++ b/darwin-x86/clang-headers/x86intrin.h
@@ -32,26 +32,6 @@
 #include <mm3dnow.h>
 #endif
 
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
-#include <bmiintrin.h>
-#endif
-
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
-#include <bmi2intrin.h>
-#endif
-
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
-#include <lzcntintrin.h>
-#endif
-
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
-#include <popcntintrin.h>
-#endif
-
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
-#include <rdseedintrin.h>
-#endif
-
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
 #include <prfchwintrin.h>
 #endif
@@ -72,14 +52,17 @@
 #include <tbmintrin.h>
 #endif
 
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
-#include <f16cintrin.h>
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LWP__)
+#include <lwpintrin.h>
 #endif
 
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
 #include <mwaitxintrin.h>
 #endif
 
-/* FIXME: LWP */
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLZERO__)
+#include <clzerointrin.h>
+#endif
 
 #endif /* __X86INTRIN_H */
diff --git a/darwin-x86/clang-headers/xmmintrin.h b/darwin-x86/clang-headers/xmmintrin.h
index 3110e8b..17af172 100644
--- a/darwin-x86/clang-headers/xmmintrin.h
+++ b/darwin-x86/clang-headers/xmmintrin.h
@@ -40,13 +40,14 @@
 #endif
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
 
-/// \brief Adds the 32-bit float values in the low-order bits of the operands.
+/// Adds the 32-bit float values in the low-order bits of the operands.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
+/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the source operands.
@@ -64,12 +65,12 @@
   return __a;
 }
 
-/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
+/// Adds two 128-bit vectors of [4 x float], and returns the results of
 ///    the addition.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
+/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the source operands.
@@ -83,12 +84,12 @@
   return (__m128)((__v4sf)__a + (__v4sf)__b);
 }
 
-/// \brief Subtracts the 32-bit float value in the low-order bits of the second
+/// Subtracts the 32-bit float value in the low-order bits of the second
 ///    operand from the corresponding value in the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
+/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
@@ -106,13 +107,13 @@
   return __a;
 }
 
-/// \brief Subtracts each of the values of the second operand from the first
+/// Subtracts each of the values of the second operand from the first
 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
 ///    the results of the subtraction.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
+/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the minuend.
@@ -126,12 +127,12 @@
   return (__m128)((__v4sf)__a - (__v4sf)__b);
 }
 
-/// \brief Multiplies two 32-bit float values in the low-order bits of the
+/// Multiplies two 32-bit float values in the low-order bits of the
 ///    operands.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
+/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the source operands.
@@ -149,12 +150,12 @@
   return __a;
 }
 
-/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
+/// Multiplies two 128-bit vectors of [4 x float] and returns the
 ///    results of the multiplication.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
+/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the source operands.
@@ -168,12 +169,12 @@
   return (__m128)((__v4sf)__a * (__v4sf)__b);
 }
 
-/// \brief Divides the value in the low-order 32 bits of the first operand by
+/// Divides the value in the low-order 32 bits of the first operand by
 ///    the corresponding value in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
+/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
@@ -191,11 +192,11 @@
   return __a;
 }
 
-/// \brief Divides two 128-bit vectors of [4 x float].
+/// Divides two 128-bit vectors of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
+/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the dividend.
@@ -209,12 +210,12 @@
   return (__m128)((__v4sf)__a / (__v4sf)__b);
 }
 
-/// \brief Calculates the square root of the value stored in the low-order bits
+/// Calculates the square root of the value stored in the low-order bits
 ///    of a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
+/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -224,16 +225,15 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sqrt_ss(__m128 __a)
 {
-  __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
-  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
 }
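
// A minimal check of the new form (illustrative): __builtin_ia32_sqrtss
// already passes the upper three lanes of its operand through, so the
// manual reassembly that was removed above is redundant.
//
//   __m128 v = _mm_set_ss(9.0f);
//   float  r = _mm_cvtss_f32(_mm_sqrt_ss(v));  // 3.0f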
 
-/// \brief Calculates the square roots of the values stored in a 128-bit vector
+/// Calculates the square roots of the values stored in a 128-bit vector
 ///    of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
+/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -245,12 +245,12 @@
   return __builtin_ia32_sqrtps((__v4sf)__a);
 }
 
-/// \brief Calculates the approximate reciprocal of the value stored in the
+/// Calculates the approximate reciprocal of the value stored in the
 ///    low-order bits of a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
+/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -260,16 +260,15 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp_ss(__m128 __a)
 {
-  __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
-  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
 }
 
-/// \brief Calculates the approximate reciprocals of the values stored in a
+/// Calculates the approximate reciprocals of the values stored in a
 ///    128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
+/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -278,15 +277,15 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp_ps(__m128 __a)
 {
-  return __builtin_ia32_rcpps((__v4sf)__a);
+  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
 }
 
-/// \brief Calculates the approximate reciprocal of the square root of the value
+/// Calculates the approximate reciprocal of the square root of the value
 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
+/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -297,16 +296,15 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rsqrt_ss(__m128 __a)
 {
-  __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
-  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+  return __builtin_ia32_rsqrtss((__v4sf)__a);
 }
 
-/// \brief Calculates the approximate reciprocals of the square roots of the
+/// Calculates the approximate reciprocals of the square roots of the
 ///    values stored in a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
+/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -318,13 +316,13 @@
   return __builtin_ia32_rsqrtps((__v4sf)__a);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands and returns the lesser value in the low-order bits of the
 ///    vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
+/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -341,12 +339,12 @@
   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 128-bit vectors of [4 x float] and returns the
-///    lesser of each pair of values.
+/// Compares two 128-bit vectors of [4 x float] and returns the lesser
+///    of each pair of values.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
+/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands.
@@ -360,13 +358,13 @@
   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
-///    operands and returns the greater value in the low-order bits of
-///    a vector [4 x float].
+/// Compares two 32-bit float values in the low-order bits of both
+///    operands and returns the greater value in the low-order bits of a 128-bit
+///    vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
+/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -383,12 +381,12 @@
   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
+/// Compares two 128-bit vectors of [4 x float] and returns the greater
 ///    of each pair of values.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
+/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands.
@@ -402,11 +400,11 @@
   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
+/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
+/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector containing one of the source operands.
@@ -420,13 +418,13 @@
   return (__m128)((__v4su)__a & (__v4su)__b);
 }
 
-/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
+/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
 ///    the one's complement of the values contained in the first source
 ///    operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
+/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the first source operand. The
@@ -442,11 +440,11 @@
   return (__m128)(~(__v4su)__a & (__v4su)__b);
 }
 
-/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
+/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VORPS / ORPS instructions.
+/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the source operands.
@@ -460,12 +458,12 @@
   return (__m128)((__v4su)__a | (__v4su)__b);
 }
 
-/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
+/// Performs a bitwise exclusive OR of two 128-bit vectors of
 ///    [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the source operands.
@@ -479,13 +477,13 @@
   return (__m128)((__v4su)__a ^ (__v4su)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands for equality and returns the result of the comparison in the
 ///    low-order bits of a vector [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
+/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -501,12 +499,12 @@
   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] for equality.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
+/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -519,14 +517,14 @@
   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is less than the
 ///    corresponding value in the second operand and returns the result of the
 ///    comparison in the low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -542,13 +540,13 @@
   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are less than those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -561,7 +559,7 @@
   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is less than or
 ///    equal to the corresponding value in the second operand and returns the
 ///    result of the comparison in the low-order bits of a vector of
@@ -569,7 +567,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -585,13 +583,13 @@
   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are less than or equal to those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -604,14 +602,14 @@
   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is greater than
 ///    the corresponding value in the second operand and returns the result of
 ///    the comparison in the low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -629,13 +627,13 @@
                                          4, 1, 2, 3);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are greater than those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -648,7 +646,7 @@
   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is greater than
 ///    or equal to the corresponding value in the second operand and returns
 ///    the result of the comparison in the low-order bits of a vector of
@@ -656,7 +654,7 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -674,13 +672,13 @@
                                          4, 1, 2, 3);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are greater than or equal to those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -693,13 +691,14 @@
   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands for inequality and returns the result of the comparison in the
 ///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
+/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -715,12 +714,13 @@
   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] for inequality.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -733,14 +733,15 @@
   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not less than
 ///    the corresponding value in the second operand and returns the result of
 ///    the comparison in the low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -756,13 +757,14 @@
   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not less than those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -775,7 +777,7 @@
   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not less than
 ///    or equal to the corresponding value in the second operand and returns
 ///    the result of the comparison in the low-order bits of a vector of
@@ -783,7 +785,8 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -799,13 +802,14 @@
   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not less than or equal to those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -818,7 +822,7 @@
   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not greater
 ///    than the corresponding value in the second operand and returns the
 ///    result of the comparison in the low-order bits of a vector of
@@ -826,7 +830,8 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -844,13 +849,14 @@
                                          4, 1, 2, 3);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not greater than those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -863,7 +869,7 @@
   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not greater
 ///    than or equal to the corresponding value in the second operand and
 ///    returns the result of the comparison in the low-order bits of a vector
@@ -871,7 +877,8 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -889,13 +896,14 @@
                                          4, 1, 2, 3);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not greater than or equal to those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -908,7 +916,7 @@
   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is ordered with
 ///    respect to the corresponding value in the second operand and returns the
 ///    result of the comparison in the low-order bits of a vector of
@@ -916,7 +924,8 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
+/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -932,13 +941,14 @@
   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are ordered with respect to those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
+/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -951,7 +961,7 @@
   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is unordered
 ///    with respect to the corresponding value in the second operand and
 ///    returns the result of the comparison in the low-order bits of a vector
@@ -959,7 +969,8 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
+/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -975,13 +986,14 @@
   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares each of the corresponding 32-bit float values of the
+/// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are unordered with respect to those in the second operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
+/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -994,12 +1006,15 @@
   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands for equality and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1007,20 +1022,24 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the
+///    two lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is less than the second
 ///    operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1028,20 +1047,23 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is less than or equal to the
 ///    second operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1049,20 +1071,23 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is greater than the second
 ///    operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1070,20 +1095,23 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the
+///     two lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is greater than or equal to
 ///    the second operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1091,20 +1119,23 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Compares two 32-bit float values in the low-order bits of both
+/// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is not equal to the second
 ///    operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1112,20 +1143,23 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the
+///     two lower 32-bit values is NaN, 1 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Performs an unordered comparison of two 32-bit float values using
+/// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine equality and returns
 ///    the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1133,20 +1167,23 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Performs an unordered comparison of two 32-bit float values using
+/// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
 ///    less than the second operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1154,21 +1191,24 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine if the first operand
-///    is less than or equal to the second operand and returns the result of
-///    the comparison.
+/// Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    less than or equal to the second operand and returns the result of the
+///    comparison.
+///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1176,21 +1216,24 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine if the first operand
-///    is greater than the second operand and returns the result of the
+/// Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    greater than the second operand and returns the result of the
 ///    comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1198,21 +1241,24 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Performs an unordered comparison of two 32-bit float values using
+/// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
 ///    greater than or equal to the second operand and returns the result of
 ///    the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1220,20 +1266,23 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Performs an unordered comparison of two 32-bit float values using
+/// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine inequality and returns
 ///    the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1241,19 +1290,21 @@
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower 32-bit values is NaN, 1 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
 }
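Since the \returns updates in this hunk all hinge on NaN behavior, here is a minimal host-side sketch (not part of the header; assumes an SSE-capable x86 target and C99) confirming the documented values, plus the one real difference between the families: COMISS raises an invalid-operation exception on quiet NaNs, UCOMISS does not.

#include <stdio.h>
#include <math.h>
#include <xmmintrin.h>

int main(void) {
  __m128 one  = _mm_set_ss(1.0f);
  __m128 qnan = _mm_set_ss(nanf(""));

  /* Against NaN every predicate returns 0 except "not equal". */
  printf("comieq  (1, NaN) = %d\n", _mm_comieq_ss(one, qnan));   /* 0 */
  printf("comineq (1, NaN) = %d\n", _mm_comineq_ss(one, qnan));  /* 1 */
  /* The unordered variants return the same values, quietly. */
  printf("ucomieq (1, NaN) = %d\n", _mm_ucomieq_ss(one, qnan));  /* 0 */
  printf("ucomineq(1, NaN) = %d\n", _mm_ucomineq_ss(one, qnan)); /* 1 */
  return 0;
}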
 
-/// \brief Converts a float value contained in the lower 32 bits of a vector of
+/// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1265,12 +1316,13 @@
   return __builtin_ia32_cvtss2si((__v4sf)__a);
 }
 
-/// \brief Converts a float value contained in the lower 32 bits of a vector of
+/// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1284,12 +1336,13 @@
 
 #ifdef __x86_64__
 
-/// \brief Converts a float value contained in the lower 32 bits of a vector of
+/// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 64-bit integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1303,45 +1356,46 @@
 
 #endif
 
-/// \brief Converts two low-order float values in a 128-bit vector of
+/// Converts two low-order float values in a 128-bit vector of
 ///    [4 x float] into a 64-bit vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPS2PI instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtps_pi32(__m128 __a)
 {
   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
 }
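With the switch to __DEFAULT_FN_ATTRS_MMX this intrinsic now formally requires MMX, so callers touch MMX state and must leave it with _mm_empty() before later x87 code. A small usage sketch (illustrative only):

#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>

int main(void) {
  __m128 v = _mm_setr_ps(1.5f, -2.5f, 3.0f, 4.0f);
  __m64 lo = _mm_cvtps_pi32(v);      /* converts elements 0 and 1 only */
  long long bits;
  memcpy(&bits, &lo, sizeof bits);   /* copy out before leaving MMX state */
  _mm_empty();                       /* EMMS: required after MMX use */
  /* Default round-to-nearest-even: 1.5 -> 2, -2.5 -> -2. */
  printf("%d %d\n", (int)bits, (int)(bits >> 32));
  return 0;
}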
 
-/// \brief Converts two low-order float values in a 128-bit vector of
+/// Converts two low-order float values in a 128-bit vector of
 ///    [4 x float] into a 64-bit vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPS2PI instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_cvt_ps2pi(__m128 __a)
 {
   return _mm_cvtps_pi32(__a);
 }
 
-/// \brief Converts a float value contained in the lower 32 bits of a vector of
+/// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer, truncating the result when it is
 ///    inexact.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1350,16 +1404,17 @@
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvttss_si32(__m128 __a)
 {
-  return __a[0];
+  return __builtin_ia32_cvttss2si((__v4sf)__a);
 }
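The old body, `return __a[0];`, leaned on the C float-to-int conversion; going through the builtin pins down CVTTSS2SI semantics, including the integer-indefinite result (0x80000000) for NaN and out-of-range inputs. A quick sketch of the truncating versus the rounding conversion:

#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m128 v = _mm_set_ss(2.75f);
  printf("%d\n", _mm_cvtss_si32(v));   /* 3: rounds (default MXCSR mode) */
  printf("%d\n", _mm_cvttss_si32(v));  /* 2: truncates toward zero */
  return 0;
}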
 
-/// \brief Converts a float value contained in the lower 32 bits of a vector of
+/// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer, truncating the result when it is
 ///    inexact.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1371,13 +1426,15 @@
   return _mm_cvttss_si32(__a);
 }
 
-/// \brief Converts a float value contained in the lower 32 bits of a vector of
+#ifdef __x86_64__
+/// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 64-bit integer, truncating the result when it is
 ///    inexact.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+///   instructions.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1386,51 +1443,53 @@
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvttss_si64(__m128 __a)
 {
-  return __a[0];
+  return __builtin_ia32_cvttss2si64((__v4sf)__a);
 }
+#endif
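Because the 64-bit variant is now compiled only for x86-64, portable callers need the same guard. A sketch (the helper name is illustrative):

#include <xmmintrin.h>

static long long truncate_widest(__m128 v) {
#ifdef __x86_64__
  return _mm_cvttss_si64(v);   /* 64-bit CVTTSS2SI, x86-64 only */
#else
  return _mm_cvttss_si32(v);   /* 32-bit fallback on i386 */
#endif
}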
 
-/// \brief Converts two low-order float values in a 128-bit vector of
+/// Converts two low-order float values in a 128-bit vector of
 ///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
 ///    when it is inexact.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions.
+/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_cvttps_pi32(__m128 __a)
 {
   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
 }
 
-/// \brief Converts two low-order float values in a 128-bit vector of [4 x
+/// Converts two low-order float values in a 128-bit vector of [4 x
 ///    float] into a 64-bit vector of [2 x i32], truncating the result when it
 ///    is inexact.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
+/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtt_ps2pi(__m128 __a)
 {
   return _mm_cvttps_pi32(__a);
 }
 
-/// \brief Converts a 32-bit signed integer value into a floating point value
+/// Converts a 32-bit signed integer value into a floating point value
 ///    and writes it to the lower 32 bits of the destination. The remaining
 ///    higher order elements of the destination vector are copied from the
 ///    corresponding elements in the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -1446,14 +1505,14 @@
   return __a;
 }
 
-/// \brief Converts a 32-bit signed integer value into a floating point value
+/// Converts a 32-bit signed integer value into a floating point value
 ///    and writes it to the lower 32 bits of the destination. The remaining
 ///    higher order elements of the destination are copied from the
 ///    corresponding elements in the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -1470,14 +1529,14 @@
 
 #ifdef __x86_64__
 
-/// \brief Converts a 64-bit signed integer value into a floating point value
+/// Converts a 64-bit signed integer value into a floating point value
 ///    and writes it to the lower 32 bits of the destination. The remaining
 ///    higher order elements of the destination are copied from the
 ///    corresponding elements in the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -1495,14 +1554,14 @@
 
 #endif
 
-/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
+/// Converts two elements of a 64-bit vector of [2 x i32] into two
 ///    floating point values and writes them to the lower 64-bits of the
 ///    destination. The remaining higher order elements of the destination are
 ///    copied from the corresponding elements in the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPI2PS instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -1512,20 +1571,20 @@
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value of the second operand. The upper 64 bits are copied from
 ///    the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 {
   return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
 }
 
-/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
+/// Converts two elements of a 64-bit vector of [2 x i32] into two
 ///    floating point values and writes them to the lower 64-bits of the
 ///    destination. The remaining higher order elements of the destination are
 ///    copied from the corresponding elements in the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPI2PS instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float].
@@ -1535,18 +1594,18 @@
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value from the second operand. The upper 64 bits are copied
 ///    from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 {
   return _mm_cvtpi32_ps(__a, __b);
 }
 
-/// \brief Extracts a float value contained in the lower 32 bits of a vector of
+/// Extracts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic has no corresponding instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1558,13 +1617,13 @@
   return __a[0];
 }
 
-/// \brief Loads two packed float values from the address __p into the
+/// Loads two packed float values from the address \a __p into the
 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
 ///     are copied from the low-order bits of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
@@ -1585,13 +1644,13 @@
   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
 }
 
-/// \brief Loads two packed float values from the address __p into the low-order
-///    bits of a 128-bit vector of [4 x float]. The high-order bits are copied
-///    from the high-order bits of the first operand.
+/// Loads two packed float values from the address \a __p into the
+///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
+///    are copied from the high-order bits of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
@@ -1612,14 +1671,14 @@
   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
 }
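A short sketch of how the two half-loads combine (the helper name is illustrative): starting from zero, MOVLPS fills the low pair and MOVHPS the high pair.

#include <xmmintrin.h>

static __m128 load_two_halves(const __m64 *lo, const __m64 *hi) {
  __m128 v = _mm_setzero_ps();
  v = _mm_loadl_pi(v, lo);   /* elements 0 and 1 from *lo */
  v = _mm_loadh_pi(v, hi);   /* elements 2 and 3 from *hi */
  return v;
}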
 
-/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
 ///    32 bits of the vector are initialized with the single-precision
 ///    floating-point value loaded from a specified memory location. The upper
 ///    96 bits are set to zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a 32-bit memory location containing a single-precision
@@ -1634,21 +1693,21 @@
     float __u;
   } __attribute__((__packed__, __may_alias__));
   float __u = ((struct __mm_load_ss_struct*)__p)->__u;
-  return (__m128){ __u, 0, 0, 0 };
+  return __extension__ (__m128){ __u, 0, 0, 0 };
 }
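The __extension__ markers added throughout this hunk address a diagnostic, not codegen: compound literals are a C99 feature, so -Wpedantic flags them in C89 and C++ translation units, and __extension__ suppresses the warning. The same idiom works in user code, e.g.:

#include <xmmintrin.h>

static __m128 unit_x(void) {
  /* Vector compound literal; __extension__ keeps -Wpedantic quiet. */
  return __extension__ (__m128){ 1.0f, 0.0f, 0.0f, 0.0f };
}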
 
-/// \brief Loads a 32-bit float value and duplicates it to all four vector
+/// Loads a 32-bit float value and duplicates it to all four vector
 ///    elements of a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
+/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
 ///    instruction.
 ///
 /// \param __p
 ///    A pointer to a float value to be loaded and duplicated.
-/// \returns A 128-bit vector of [4 x float] containing the loaded
-///    and duplicated values.
+/// \returns A 128-bit vector of [4 x float] containing the loaded and
+///    duplicated values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load1_ps(const float *__p)
 {
@@ -1656,34 +1715,34 @@
     float __u;
   } __attribute__((__packed__, __may_alias__));
   float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
-  return (__m128){ __u, __u, __u, __u };
+  return __extension__ (__m128){ __u, __u, __u, __u };
 }
 
 #define        _mm_load_ps1(p) _mm_load1_ps(p)
 
-/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
+/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
 ///    memory location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a 128-bit memory location. The address of the memory
 ///    location has to be 128-bit aligned.
-/// \returns A 128-bit vector of [4 x float] containing the loaded valus.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load_ps(const float *__p)
 {
   return *(__m128*)__p;
 }
 
-/// \brief Loads a 128-bit floating-point vector of [4 x float] from an
+/// Loads a 128-bit floating-point vector of [4 x float] from an
 ///    unaligned memory location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a 128-bit memory location. The address of the memory
@@ -1698,12 +1757,12 @@
   return ((struct __loadu_ps*)__p)->__v;
 }
 
-/// \brief Loads four packed float values, in reverse order, from an aligned
+/// Loads four packed float values, in reverse order, from an aligned
 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
 ///    instruction.
 ///
 /// \param __p
@@ -1718,27 +1777,26 @@
   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
 }
 
-/// \brief Create a 128-bit vector of [4 x float] with undefined values.
+/// Create a 128-bit vector of [4 x float] with undefined values.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic has no corresponding instruction.
 ///
 /// \returns A 128-bit vector of [4 x float] containing undefined values.
-
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_undefined_ps(void)
 {
   return (__m128)__builtin_ia32_undef128();
 }
 
-/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
 ///    32 bits of the vector are initialized with the specified single-precision
 ///    floating-point value. The upper 96 bits are set to zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
 ///
 /// \param __w
 ///    A single-precision floating-point value used to initialize the lower 32
@@ -1749,16 +1807,16 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ss(float __w)
 {
-  return (__m128){ __w, 0, 0, 0 };
+  return __extension__ (__m128){ __w, 0, 0, 0 };
 }
 
-/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
+/// Constructs a 128-bit floating-point vector of [4 x float], with each
 ///    of the four single-precision floating-point vector elements set to the
 ///    specified single-precision floating-point value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
 ///
 /// \param __w
 ///    A single-precision floating-point value used to initialize each vector
@@ -1767,17 +1825,17 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set1_ps(float __w)
 {
-  return (__m128){ __w, __w, __w, __w };
+  return __extension__ (__m128){ __w, __w, __w, __w };
 }
 
 /* Microsoft specific. */
-/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
+/// Constructs a 128-bit floating-point vector of [4 x float], with each
 ///    of the four single-precision floating-point vector elements set to the
 ///    specified single-precision floating-point value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
 ///
 /// \param __w
 ///    A single-precision floating-point value used to initialize each vector
@@ -1789,7 +1847,7 @@
     return _mm_set1_ps(__w);
 }
 
-/// \brief Constructs a 128-bit floating-point vector of [4 x float]
+/// Constructs a 128-bit floating-point vector of [4 x float]
 ///    initialized with the specified single-precision floating-point values.
 ///
 /// \headerfile <x86intrin.h>
@@ -1813,10 +1871,10 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ps(float __z, float __y, float __x, float __w)
 {
-  return (__m128){ __w, __x, __y, __z };
+  return __extension__ (__m128){ __w, __x, __y, __z };
 }
 
-/// \brief Constructs a 128-bit floating-point vector of [4 x float],
+/// Constructs a 128-bit floating-point vector of [4 x float],
 ///    initialized in reverse order with the specified 32-bit single-precision
 ///    float-point values.
 ///
@@ -1841,30 +1899,30 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_setr_ps(float __z, float __y, float __x, float __w)
 {
-  return (__m128){ __z, __y, __x, __w };
+  return __extension__ (__m128){ __z, __y, __x, __w };
 }
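The only difference between the two constructors is argument order, a perennial source of bugs; a sketch showing they build the same vector:

#include <assert.h>
#include <string.h>
#include <xmmintrin.h>

int main(void) {
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   /* args high-to-low */
  __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  /* args low-to-high */
  assert(memcmp(&a, &b, sizeof a) == 0);           /* identical layouts */
  return 0;
}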
 
-/// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
+/// Constructs a 128-bit floating-point vector of [4 x float] initialized
 ///    to zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
 ///
 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
 ///    all elements set to zero.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_setzero_ps(void)
 {
-  return (__m128){ 0, 0, 0, 0 };
+  return __extension__ (__m128){ 0, 0, 0, 0 };
 }
 
-/// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
+/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
 ///    memory location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a 64-bit memory location.
@@ -1876,12 +1934,12 @@
   __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
 }
 
-/// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
+/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
 ///     memory location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a memory location that will receive the float values.
@@ -1893,12 +1951,12 @@
   __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
 }
 
-/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
+/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
 ///     memory location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a 32-bit memory location.
@@ -1913,12 +1971,12 @@
   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
 }
 
-/// \brief Stores float values from a 128-bit vector of [4 x float] to an
-///    unaligned memory location.
+/// Stores a 128-bit vector of [4 x float] to an unaligned memory
+///    location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a 128-bit memory location. The address of the memory
@@ -1934,38 +1992,37 @@
   ((struct __storeu_ps*)__p)->__v = __a;
 }
 
-/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
-///    four contiguous elements in an aligned memory location.
+/// Stores a 128-bit vector of [4 x float] into an aligned memory
+///    location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
-///    instruction.
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
 ///
 /// \param __p
-///    A pointer to a 128-bit memory location.
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 16-byte aligned.
 /// \param __a
-///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-///    of the four contiguous elements pointed by __p.
+///    A 128-bit vector of [4 x float] containing the values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ps(float *__p, __m128 __a)
 {
   *(__m128*)__p = __a;
 }
 
-/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
 ///    four contiguous elements in an aligned memory location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
+/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
 ///    instruction.
 ///
 /// \param __p
 ///    A pointer to a 128-bit memory location.
 /// \param __a
 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-///    of the four contiguous elements pointed by __p.
+///    of the four contiguous elements pointed by \a __p.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store1_ps(float *__p, __m128 __a)
 {
@@ -1973,30 +2030,31 @@
   _mm_store_ps(__p, __a);
 }
 
-/// \brief Stores float values from a 128-bit vector of [4 x float] to an
-///    aligned memory location.
+/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+///    four contiguous elements in an aligned memory location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
+///    instruction.
 ///
 /// \param __p
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location has to be 128-bit aligned.
+///    A pointer to a 128-bit memory location.
 /// \param __a
-///    A 128-bit vector of [4 x float] containing the values to be stored.
+///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+///    of the four contiguous elements pointed by \a __p.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ps1(float *__p, __m128 __a)
 {
-  return _mm_store1_ps(__p, __a);
+  _mm_store1_ps(__p, __a);
 }
 
-/// \brief Stores float values from a 128-bit vector of [4 x float] to an
+/// Stores float values from a 128-bit vector of [4 x float] to an
 ///    aligned memory location in reverse order.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
 ///    instruction.
 ///
 /// \param __p
@@ -2011,16 +2069,18 @@
   _mm_store_ps(__p, __a);
 }
 
-#define _MM_HINT_T0 3
-#define _MM_HINT_T1 2
-#define _MM_HINT_T2 1
+#define _MM_HINT_ET0 7
+#define _MM_HINT_ET1 6
+#define _MM_HINT_T0  3
+#define _MM_HINT_T1  2
+#define _MM_HINT_T2  1
 #define _MM_HINT_NTA 0
 
 #ifndef _MSC_VER
 /* FIXME: We have to #define this because "sel" must be a constant integer, and
    Sema doesn't do any form of constant propagation yet. */
 
-/// \brief Loads one cache line of data from the specified address to a location
+/// Loads one cache line of data from the specified address to a location
 ///    closer to the processor.
 ///
 /// \headerfile <x86intrin.h>
@@ -2029,52 +2089,54 @@
 /// void _mm_prefetch(const void * a, const int sel);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c PREFETCHNTA instruction.
+/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
 ///
 /// \param a
 ///    A pointer to a memory location containing a cache line of data.
 /// \param sel
-///    A predefined integer constant specifying the type of prefetch operation:
-///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
-///    The PREFETCHNTA instruction will be generated.
+///    A predefined integer constant specifying the type of prefetch
+///    operation: \n
+///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
+///    PREFETCHNTA instruction will be generated. \n
 ///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
-///    be generated.
+///    be generated. \n
 ///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
-///    be generated.
+///    be generated. \n
 ///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
 ///    be generated.
-#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
+#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \
+                                                 ((sel) >> 2) & 1, (sel) & 0x3))
 #endif
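The rewritten macro decodes sel instead of forwarding it: bit 2 becomes __builtin_prefetch's read/write argument (set only by the new ET hints) and bits [1:0] the locality level, so T0/T1/T2/NTA keep their old encodings. A usage sketch (the function name is illustrative):

#include <xmmintrin.h>

static void warm(const char *p) {
  _mm_prefetch(p,       _MM_HINT_T0);   /* rw=0, locality=3: PREFETCHT0  */
  _mm_prefetch(p + 64,  _MM_HINT_NTA);  /* rw=0, locality=0: PREFETCHNTA */
  _mm_prefetch(p + 128, _MM_HINT_ET0);  /* rw=1, locality=3: write hint  */
}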
 
-/// \brief Stores a 64-bit integer in the specified aligned memory location. To
+/// Stores a 64-bit integer in the specified aligned memory location. To
 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
 ///    used again soon).
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c MOVNTQ instruction.
+/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
 ///
 /// \param __p
 ///    A pointer to an aligned memory location used to store the register value.
 /// \param __a
 ///    A 64-bit integer containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS_MMX
 _mm_stream_pi(__m64 *__p, __m64 __a)
 {
   __builtin_ia32_movntq(__p, __a);
 }
 
-/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
+/// Moves packed float values from a 128-bit vector of [4 x float] to a
 ///    128-bit aligned memory location. To minimize caching, the data is flagged
 ///    as non-temporal (unlikely to be used again soon).
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a 128-bit aligned memory location that will receive the
-///    integer values.
+///    single-precision floating-point values.
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
@@ -2083,283 +2145,301 @@
   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
 }
 
-/// \brief Forces strong memory ordering (serialization) between store
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// Forces strong memory ordering (serialization) between store
 ///    instructions preceding this instruction and store instructions following
 ///    this instruction, ensuring the system completes all previous stores
 ///    before executing subsequent stores.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c SFENCE instruction.
+/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
 ///
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_sfence(void)
-{
-  __builtin_ia32_sfence();
-}
+void _mm_sfence(void);
 
-/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
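With _mm_sfence now a plain extern "C" declaration (the compiler still supplies the body as a builtin), typical usage pairs it with the streaming stores above; a sketch, with illustrative names:

#include <xmmintrin.h>

/* Stream a vector to dst, then order it before raising a flag that
   another thread polls.  dst must be 16-byte aligned for MOVNTPS. */
static void publish(float *dst, __m128 v, volatile int *ready) {
  _mm_stream_ps(dst, v);   /* non-temporal store, bypasses the cache */
  _mm_sfence();            /* drain streaming stores before the flag */
  *ready = 1;
}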
+/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
 ///    returns it, as specified by the immediate integer operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
+/// \code
+/// int _mm_extract_pi16(__m64 a, int n);
+/// \endcode
 ///
-/// \param __a
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param a
 ///    A 64-bit vector of [4 x i16].
-/// \param __n
-///    An immediate integer operand that determines which bits are extracted:
-///    0: Bits [15:0] are copied to the destination.
-///    1: Bits [31:16] are copied to the destination.
-///    2: Bits [47:32] are copied to the destination.
+/// \param n
+///    An immediate integer operand that determines which bits are extracted: \n
+///    0: Bits [15:0] are copied to the destination. \n
+///    1: Bits [31:16] are copied to the destination. \n
+///    2: Bits [47:32] are copied to the destination. \n
 ///    3: Bits [63:48] are copied to the destination.
 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
-#define _mm_extract_pi16(a, n) __extension__ ({ \
-  (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n); })
+#define _mm_extract_pi16(a, n) \
+  (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n)
 
-/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
+/// Copies data from the 64-bit vector of [4 x i16] to the destination,
 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
-///    specified by the immediate operand __n.
+///    specified by the immediate operand \a n.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
+/// \code
+/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
+/// \endcode
 ///
-/// \param __a
+/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
+///
+/// \param a
 ///    A 64-bit vector of [4 x i16].
-/// \param __d
+/// \param d
 ///    An integer. The lower 16-bit value from this operand is written to the
-///    destination at the offset specified by operand __n.
-/// \param __n
+///    destination at the offset specified by operand \a n.
+/// \param n
 ///    An immediate integer operand that determines which bits are used
-///    in the destination.
-///    0: Bits [15:0] are copied to the destination.
-///    1: Bits [31:16] are copied to the destination.
-///    2: Bits [47:32] are copied to the destination.
-///    3: Bits [63:48] are copied to the destination.
+///    in the destination. \n
+///    0: Bits [15:0] are copied to the destination. \n
+///    1: Bits [31:16] are copied to the destination. \n
+///    2: Bits [47:32] are copied to the destination. \n
+///    3: Bits [63:48] are copied to the destination. \n
 ///    The remaining bits in the destination are copied from the corresponding
-///    bits in operand __a.
+///    bits in operand \a a.
 /// \returns A 64-bit integer vector containing the copied packed data from the
 ///    operands.
-#define _mm_insert_pi16(a, d, n) __extension__ ({ \
-  (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n); })
+#define _mm_insert_pi16(a, d, n) \
+  (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n)
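Both macros require n to be an integer constant expression, which is why they stay macros rather than inline functions. A usage sketch:

#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 v = _mm_setr_pi16(10, 20, 30, 40);
  int w = _mm_extract_pi16(v, 2);   /* element 2 -> 30 */
  v = _mm_insert_pi16(v, 99, 1);    /* overwrite element 1 with 99 */
  int x = _mm_extract_pi16(v, 1);   /* -> 99 */
  _mm_empty();                      /* leave MMX state before printf */
  printf("%d %d\n", w, x);
  return 0;
}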
 
-/// \brief Compares each of the corresponding packed 16-bit integer values of
+/// Compares each of the corresponding packed 16-bit integer values of
 ///    the 64-bit integer vectors, and writes the greater value to the
 ///    corresponding bits in the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMAXSW instruction.
+/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing one of the source operands.
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_max_pi16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
 }
 
-/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+/// Compares each of the corresponding packed 8-bit unsigned integer
 ///    values of the 64-bit integer vectors, and writes the greater value to the
 ///    corresponding bits in the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMAXUB instruction.
+/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing one of the source operands.
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_max_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
 }
 
-/// \brief Compares each of the corresponding packed 16-bit integer values of
+/// Compares each of the corresponding packed 16-bit integer values of
 ///    the 64-bit integer vectors, and writes the lesser value to the
 ///    corresponding bits in the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMINSW instruction.
+/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing one of the source operands.
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_min_pi16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
 }
 
-/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+/// Compares each of the corresponding packed 8-bit unsigned integer
 ///    values of the 64-bit integer vectors, and writes the lesser value to the
 ///    corresponding bits in the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMINUB instruction.
+/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing one of the source operands.
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_min_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
 }
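A short sketch of the elementwise signed 16-bit min/max semantics documented above (values illustrative; the unsigned 8-bit variants behave analogously per lane):

    #include <string.h>
    #include <xmmintrin.h>

    /* Writes {3, 10, 300, 8} to mx and {-5, -2, 200, -7} to mn. */
    static void minmax_demo(short mx[4], short mn[4]) {
      __m64 a = _mm_setr_pi16(-5, 10, 300, -7);
      __m64 b = _mm_setr_pi16( 3, -2, 200,  8);
      __m64 hi = _mm_max_pi16(a, b);
      __m64 lo = _mm_min_pi16(a, b);
      memcpy(mx, &hi, sizeof(hi));
      memcpy(mn, &lo, sizeof(lo));
      _mm_empty();  /* leave MMX state */
    }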
 
-/// \brief Takes the most significant bit from each 8-bit element in a 64-bit
-///    integer vector to create a 16-bit mask value. Zero-extends the value to
+/// Takes the most significant bit from each 8-bit element in a 64-bit
+///    integer vector to create an 8-bit mask value. Zero-extends the value to
 ///    a 32-bit integer and writes it to the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMOVMSKB instruction.
+/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing the values with bits to be extracted.
-/// \returns The most significant bit from each 8-bit element in the operand,
-///    written to bits [15:0].
-static __inline__ int __DEFAULT_FN_ATTRS
+/// \returns The most significant bit from each 8-bit element in \a __a,
+///    written to bits [7:0].
+static __inline__ int __DEFAULT_FN_ATTRS_MMX
 _mm_movemask_pi8(__m64 __a)
 {
   return __builtin_ia32_pmovmskb((__v8qi)__a);
 }
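A usage sketch matching the corrected \returns text (the mask occupies bits [7:0] only; values illustrative):

    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void) {
      /* sign bit set in elements 0, 2, and 7 */
      __m64 v = _mm_setr_pi8(-1, 0, -1, 0, 0, 0, 0, -1);
      int mask = _mm_movemask_pi8(v);
      _mm_empty();
      printf("0x%02X\n", mask);  /* 0x85: bits 0, 2, and 7 set */
      return 0;
    }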
 
-/// \brief Multiplies packed 16-bit unsigned integer values and writes the
+/// Multiplies packed 16-bit unsigned integer values and writes the
 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
 ///    the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PMULHUW instruction.
+/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing one of the source operands.
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_mulhi_pu16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
 }
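A sketch of the unsigned high-half multiply: each lane keeps the upper 16 bits of the full 32-bit product (values illustrative):

    #include <stdio.h>
    #include <string.h>
    #include <xmmintrin.h>

    int main(void) {
      /* 50000 * 40000 = 2000000000 = 0x77359400, so each lane holds 0x7735 */
      __m64 r = _mm_mulhi_pu16(_mm_set1_pi16((short)50000),
                               _mm_set1_pi16((short)40000));
      unsigned short out[4];
      memcpy(out, &r, sizeof(out));
      _mm_empty();
      printf("0x%04X\n", out[0]);
      return 0;
    }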
 
-/// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
 ///    destination, as specified by the immediate value operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSHUFW instruction.
-///
 /// \code
 /// __m64 _mm_shuffle_pi16(__m64 a, const int n);
 /// \endcode
 ///
+/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
+///
 /// \param a
 ///    A 64-bit integer vector containing the values to be shuffled.
 /// \param n
 ///    An immediate value containing an 8-bit value specifying which elements to
-///    copy from a. The destinations within the 64-bit destination are assigned
-///    values as follows:
-///    Bits [1:0] are used to assign values to bits [15:0] in the destination.
-///    Bits [3:2] are used to assign values to bits [31:16] in the destination.
-///    Bits [5:4] are used to assign values to bits [47:32] in the destination.
-///    Bits [7:6] are used to assign values to bits [63:48] in the destination.
-///    Bit value assignments:
-///    00: assigned from bits [15:0] of a.
-///    01: assigned from bits [31:16] of a.
-///    10: assigned from bits [47:32] of a.
-///    11: assigned from bits [63:48] of a.
+///    copy from \a a. The destinations within the 64-bit destination are
+///    assigned values as follows: \n
+///    Bits [1:0] are used to assign values to bits [15:0] in the
+///    destination. \n
+///    Bits [3:2] are used to assign values to bits [31:16] in the
+///    destination. \n
+///    Bits [5:4] are used to assign values to bits [47:32] in the
+///    destination. \n
+///    Bits [7:6] are used to assign values to bits [63:48] in the
+///    destination. \n
+///    Bit value assignments: \n
+///    00: assigned from bits [15:0] of \a a. \n
+///    01: assigned from bits [31:16] of \a a. \n
+///    10: assigned from bits [47:32] of \a a. \n
+///    11: assigned from bits [63:48] of \a a.
 /// \returns A 64-bit integer vector containing the shuffled values.
-#define _mm_shuffle_pi16(a, n) __extension__ ({ \
-  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
+#define _mm_shuffle_pi16(a, n) \
+  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
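A usage sketch of the selector encoding spelled out above; 0x1B reverses the four elements (values illustrative):

    #include <stdio.h>
    #include <string.h>
    #include <xmmintrin.h>

    int main(void) {
      __m64 v = _mm_setr_pi16(10, 11, 12, 13);
      /* 0x1B = 0b00011011: the bit pairs select elements 3, 2, 1, 0 in turn */
      __m64 r = _mm_shuffle_pi16(v, 0x1B);
      short out[4];
      memcpy(out, &r, sizeof(out));
      _mm_empty();
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 13 12 11 10 */
      return 0;
    }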
 
-/// \brief Conditionally copies the values from each 8-bit element in the first
+/// Conditionally copies the values from each 8-bit element in the first
 ///    64-bit integer vector operand to the specified memory location, as
 ///    specified by the most significant bit in the corresponding element in the
-///    second 64-bit integer vector operand. To minimize caching, the data is
-///    flagged as non-temporal (unlikely to be used again soon).
+///    second 64-bit integer vector operand.
+///
+///    To minimize caching, the data is flagged as non-temporal
+///    (unlikely to be used again soon).
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c MASKMOVQ instruction.
+/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
 ///
 /// \param __d
 ///    A 64-bit integer vector containing the values with elements to be copied.
 /// \param __n
 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
-///    element determines whether the corresponding element in operand __d is
-///    copied. If the most significant bit of a given element is 1, the
-///    corresponding element in operand __d is copied.
+///    element determines whether the corresponding element in operand \a __d
+///    is copied. If the most significant bit of a given element is 1, the
+///    corresponding element in operand \a __d is copied.
 /// \param __p
 ///    A pointer to a 64-bit memory location that will receive the conditionally
 ///    copied integer values. The address of the memory location does not have
 ///    to be aligned.
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS_MMX
 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 {
   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
 }
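A sketch of the masked non-temporal store: only elements whose mask byte has its sign bit set are written, and an sfence makes the streaming store globally visible before execution continues (values illustrative):

    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void) {
      char buf[8] = {0};
      __m64 data = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
      __m64 mask = _mm_setr_pi8(-1, 0, -1, 0, 0, 0, 0, -1); /* copy 0, 2, 7 */
      _mm_maskmove_si64(data, mask, buf);
      _mm_sfence();  /* fence the non-temporal store */
      _mm_empty();
      printf("%d %d %d\n", buf[0], buf[1], buf[7]);  /* 1 0 8 */
      return 0;
    }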
 
-/// \brief Computes the rounded averages of the packed unsigned 8-bit integer
+/// Computes the rounded averages of the packed unsigned 8-bit integer
 ///    values and writes the averages to the corresponding bits in the
 ///    destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PAVGB instruction.
+/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing one of the source operands.
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_avg_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
 }
 
-/// \brief Computes the rounded averages of the packed unsigned 16-bit integer
+/// Computes the rounded averages of the packed unsigned 16-bit integer
 ///    values and writes the averages to the corresponding bits in the
 ///    destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PAVGW instruction.
+/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing one of the source operands.
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_avg_pu16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
 }
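The rounded average in both intrinsics is (a + b + 1) >> 1 computed in widened precision, so it cannot overflow; a sketch (values illustrative):

    #include <stdio.h>
    #include <string.h>
    #include <xmmintrin.h>

    int main(void) {
      /* each byte is (250 + 253 + 1) >> 1 = 252 */
      __m64 r = _mm_avg_pu8(_mm_set1_pi8((char)250), _mm_set1_pi8((char)253));
      unsigned char out[8];
      memcpy(out, &r, sizeof(out));
      _mm_empty();
      printf("%u\n", out[0]);
      return 0;
    }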
 
-/// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
+/// Subtracts the corresponding 8-bit unsigned integer values of the two
 ///    64-bit vector operands and computes the absolute value of each
 ///    difference. Then the sum of the 8 absolute differences is written to
 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c PSADBW instruction.
+/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit integer vector containing one of the source operands.
@@ -2368,93 +2448,127 @@
 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
 ///    sets of absolute differences between both operands. The upper bits are
 ///    cleared.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_sad_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
 }
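A sketch of the sum-of-absolute-differences reduction; the scalar answer lands in bits [15:0] and can be pulled out with _mm_cvtsi64_si32 (values illustrative):

    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void) {
      __m64 a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
      __m64 b = _mm_setr_pi8(8, 7, 6, 5, 4, 3, 2, 1);
      /* 7+5+3+1+1+3+5+7 = 32 */
      int sad = _mm_cvtsi64_si32(_mm_sad_pu8(a, b));
      _mm_empty();
      printf("%d\n", sad);
      return 0;
    }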
 
-/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
-///    integer value. There are several groups of macros associated with this
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// Returns the contents of the MXCSR register as a 32-bit unsigned
+///    integer value.
+///
+///    There are several groups of macros associated with this
 ///    intrinsic, including:
-///    * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+///    <ul>
+///    <li>
+///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
 ///      _MM_GET_EXCEPTION_STATE().
-///    * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///    </li>
+///    <li>
+///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
-///    * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///    </li>
+///    <li>
+///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
-///      _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
-///    * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///      _MM_GET_ROUNDING_MODE().
+///    </li>
+///    <li>
+///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
-///    * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///    </li>
+///    <li>
+///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
 ///      _MM_GET_DENORMALS_ZERO_MODE().
+///    </li>
+///    </ul>
 ///
-///    For example, the expression below checks if an overflow exception has
+///    For example, the following expression checks if an overflow exception has
 ///    occurred:
+///    \code
 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
+///    \endcode
 ///
-///    The following example gets the current rounding mode:
+///    The following expression gets the current rounding mode:
+///    \code
 ///      _MM_GET_ROUNDING_MODE()
+///    \endcode
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
+/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
 ///
 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
 ///    register.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_getcsr(void)
-{
-  return __builtin_ia32_stmxcsr();
-}
+unsigned int _mm_getcsr(void);
 
-/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
-///    are several groups of macros associated with this intrinsic, including:
-///    * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+/// Sets the MXCSR register with the 32-bit unsigned integer value.
+///
+///    There are several groups of macros associated with this intrinsic,
+///    including:
+///    <ul>
+///    <li>
+///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
 ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
-///    * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///    </li>
+///    <li>
+///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
 ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
 ///      of these macros.
-///    * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///    </li>
+///    <li>
+///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
 ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
-///    * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///    </li>
+///    <li>
+///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
 ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
 ///      one of these macros.
-///    * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///    </li>
+///    <li>
+///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
 ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
+///    </li>
+///    </ul>
 ///
 ///    For example, the following expression causes subsequent floating-point
 ///    operations to round up:
 ///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
 ///
 ///    The following example sets the DAZ and FTZ flags:
-///      void setFlags() {
-///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
-///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)
-///      }
+///    \code
+///    void setFlags() {
+///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+///    }
+///    \endcode
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
+/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
 ///
 /// \param __i
 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_setcsr(unsigned int __i)
-{
-  __builtin_ia32_ldmxcsr(__i);
-}
+void _mm_setcsr(unsigned int __i);
 
-/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
+#if defined(__cplusplus)
+} // extern "C"
+#endif
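With _mm_getcsr and _mm_setcsr now extern "C" prototypes rather than inline definitions, the usual save/modify/restore pattern is unchanged. A sketch; masking with _MM_ROUND_MASK avoids the pitfall of OR-ing a rounding mode into bits that are already set:

    #include <xmmintrin.h>

    /* Perform one SSE addition with round-toward-zero, then restore
       the caller's MXCSR. A sketch, not a hardened utility. */
    static float add_toward_zero(float x, float y) {
      unsigned int saved = _mm_getcsr();
      _mm_setcsr((saved & ~_MM_ROUND_MASK) | _MM_ROUND_TOWARD_ZERO);
      float r = _mm_cvtss_f32(_mm_add_ss(_mm_set_ss(x), _mm_set_ss(y)));
      _mm_setcsr(saved);
      return r;
    }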
+
+/// Selects 4 float values from the 128-bit operands of [4 x float], as
 ///    specified by the immediate value operand.
 ///
 /// \headerfile <x86intrin.h>
@@ -2463,7 +2577,7 @@
 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
+/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
 ///
 /// \param a
 ///    A 128-bit vector of [4 x float].
@@ -2471,42 +2585,43 @@
 ///    A 128-bit vector of [4 x float].
 /// \param mask
 ///    An immediate value containing an 8-bit value specifying which elements to
-///    copy from a and b.
-///    Bits [3:0] specify the values copied from operand a.
-///    Bits [7:4] specify the values copied from operand b. The destinations
-///    within the 128-bit destination are assigned values as follows:
-///    Bits [1:0] are used to assign values to bits [31:0] in the destination.
-///    Bits [3:2] are used to assign values to bits [63:32] in the destination.
-///    Bits [5:4] are used to assign values to bits [95:64] in the destination.
-///    Bits [7:6] are used to assign values to bits [127:96] in the destination.
-///    Bit value assignments:
-///    00: Bits [31:0] copied from the specified operand.
-///    01: Bits [63:32] copied from the specified operand.
-///    10: Bits [95:64] copied from the specified operand.
+///    copy from \a a and \a b. \n
+///    Bits [3:0] specify the values copied from operand \a a. \n
+///    Bits [7:4] specify the values copied from operand \a b. \n
+///    The destinations within the 128-bit destination are assigned values as
+///    follows: \n
+///    Bits [1:0] are used to assign values to bits [31:0] in the
+///    destination. \n
+///    Bits [3:2] are used to assign values to bits [63:32] in the
+///    destination. \n
+///    Bits [5:4] are used to assign values to bits [95:64] in the
+///    destination. \n
+///    Bits [7:6] are used to assign values to bits [127:96] in the
+///    destination. \n
+///    Bit value assignments: \n
+///    00: Bits [31:0] copied from the specified operand. \n
+///    01: Bits [63:32] copied from the specified operand. \n
+///    10: Bits [95:64] copied from the specified operand. \n
 ///    11: Bits [127:96] copied from the specified operand.
 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
-#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
-  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
-                                  0 + (((mask) >> 0) & 0x3), \
-                                  0 + (((mask) >> 2) & 0x3), \
-                                  4 + (((mask) >> 4) & 0x3), \
-                                  4 + (((mask) >> 6) & 0x3)); })
+#define _mm_shuffle_ps(a, b, mask) \
+  (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
+                                (int)(mask))
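A usage sketch of the selector layout described above, using the conventional _MM_SHUFFLE helper (values illustrative):

    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void) {
      __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
      __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
      /* _MM_SHUFFLE(1, 0, 3, 2) = 0x4E: result = {a[2], a[3], b[0], b[1]} */
      __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2));
      float out[4];
      _mm_storeu_ps(out, r);
      printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 2 3 4 5 */
      return 0;
    }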
 
-/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
-///    [4 x float] and interleaves them into a 128-bit vector of [4 x
-///    float].
+/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
+/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
 ///
 /// \param __a
-///    A 128-bit vector of [4 x float].
-///    Bits [95:64] are written to bits [31:0] of the destination.
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [95:64] are written to bits [31:0] of the destination. \n
 ///    Bits [127:96] are written to bits [95:64] of the destination.
 /// \param __b
 ///    A 128-bit vector of [4 x float].
-///    Bits [95:64] are written to bits [63:32] of the destination.
+///    Bits [95:64] are written to bits [63:32] of the destination. \n
 ///    Bits [127:96] are written to bits [127:96] of the destination.
 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2515,21 +2630,20 @@
   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
 }
 
-/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
-///    [4 x float] and interleaves them into a 128-bit vector of [4 x
-///    float].
+/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
+/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
 ///
 /// \param __a
-///    A 128-bit vector of [4 x float].
-///    Bits [31:0] are written to bits [31:0] of the destination.
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [31:0] are written to bits [31:0] of the destination. \n
 ///    Bits [63:32] are written to bits [95:64] of the destination.
 /// \param __b
-///    A 128-bit vector of [4 x float].
-///    Bits [31:0] are written to bits [63:32] of the destination.
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [31:0] are written to bits [63:32] of the destination. \n
 ///    Bits [63:32] are written to bits [127:96] of the destination.
 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2538,13 +2652,14 @@
   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
 }
 
-/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
 ///    96 bits are set to the upper 96 bits of the first parameter.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
+///    instruction.
 ///
 /// \param __a
 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
@@ -2556,16 +2671,17 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_move_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
+  __a[0] = __b[0];
+  return __a;
 }
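The body change from a fixed shufflevector to a scalar element assignment leaves the backend free to pick a blend or a move, matching the updated VBLENDPS / BLENDPS / MOVSS note above; the observable result is unchanged. A sketch:

    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void) {
      __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
      __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
      float out[4];
      _mm_storeu_ps(out, _mm_move_ss(a, b));  /* lane 0 from b, rest from a */
      printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 10 2 3 4 */
      return 0;
    }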
 
-/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
 ///    64 bits are set to the upper 64 bits of the first parameter.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
@@ -2580,13 +2696,13 @@
   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
 }
 
-/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
 ///    64 bits are set to the lower 64 bits of the second parameter.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
@@ -2601,19 +2717,19 @@
   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
 }
 
-/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
+/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
 ///    float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
 ///    from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtpi16_ps(__m64 __a)
 {
   __m64 __b, __c;
@@ -2631,19 +2747,19 @@
   return __r;
 }
 
-/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
+/// Converts a 64-bit vector of 16-bit unsigned integer values into a
 ///    128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
 ///    destination are copied from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtpu16_ps(__m64 __a)
 {
   __m64 __b, __c;
@@ -2660,19 +2776,19 @@
   return __r;
 }
 
-/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
+/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
 ///    into a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
 ///    from the corresponding lower 4 elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtpi8_ps(__m64 __a)
 {
   __m64 __b;
@@ -2684,12 +2800,12 @@
   return _mm_cvtpi16_ps(__b);
 }
 
-/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
+/// Converts the lower four unsigned 8-bit integer values from a 64-bit
 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
@@ -2697,7 +2813,7 @@
 ///    operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtpu8_ps(__m64 __a)
 {
   __m64 __b;
@@ -2708,12 +2824,12 @@
   return _mm_cvtpi16_ps(__b);
 }
 
-/// \brief Converts the two 32-bit signed integer values from each 64-bit vector
+/// Converts the two 32-bit signed integer values from each 64-bit vector
 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
@@ -2724,7 +2840,7 @@
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    copied and converted values from the first operand. The upper 64 bits
 ///    contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 {
   __m128 __c;
@@ -2736,23 +2852,24 @@
   return _mm_cvtpi32_ps(__c, __a);
 }
 
-/// \brief Converts each single-precision floating-point element of a 128-bit
+/// Converts each single-precision floating-point element of a 128-bit
 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
-///    packs the results into a 64-bit integer vector of [4 x i16]. If the
-///    floating-point element is NaN or infinity, or if the floating-point
-///    element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
-///    to 0x8000. Otherwise if the floating-point element is greater
-///    than 0x7FFF, it is converted to 0x7FFF.
+///    packs the results into a 64-bit integer vector of [4 x i16].
+///
+///    If the floating-point element is NaN or infinity, or if the
+///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
+///    it is converted to 0x8000. Otherwise if the floating-point element is
+///    greater than 0x7FFF, it is converted to 0x7FFF.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtps_pi16(__m128 __a)
 {
   __m64 __b, __c;
@@ -2764,24 +2881,25 @@
   return _mm_packs_pi32(__b, __c);
 }
 
-/// \brief Converts each single-precision floating-point element of a 128-bit
+/// Converts each single-precision floating-point element of a 128-bit
 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
-///    [8 x i8]. The upper 32 bits of the vector are set to 0. If the
-///    floating-point element is NaN or infinity, or if the floating-point
-///    element is greater than 0x7FFFFFFF or less than -0x80, it is converted
-///    to 0x80. Otherwise if the floating-point element is greater
+///    [8 x i8]. The upper 32 bits of the vector are set to 0.
+///
+///    If the floating-point element is NaN or infinity, or if the
+///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
+///    is converted to 0x80. Otherwise if the floating-point element is greater
 ///    than 0x7F, it is converted to 0x7F.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
 ///    converted values and the upper 32 bits are set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_cvtps_pi8(__m128 __a)
 {
   __m64 __b, __c;
@@ -2792,14 +2910,14 @@
   return _mm_packs_pi16(__b, __c);
 }
 
-/// \brief Extracts the sign bits from each single-precision floating-point
+/// Extracts the sign bits from each single-precision floating-point
 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
 ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
 ///    to zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
+/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit floating-point vector of [4 x float].
@@ -2884,6 +3002,7 @@
 #define _m_ _mm_
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_MMX
 
 /* Ugly hack for backwards-compatibility (compatible with gcc) */
 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
diff --git a/darwin-x86/clang-headers/xopintrin.h b/darwin-x86/clang-headers/xopintrin.h
index bdf0cec..9d540a2 100644
--- a/darwin-x86/clang-headers/xopintrin.h
+++ b/darwin-x86/clang-headers/xopintrin.h
@@ -31,7 +31,8 @@
 #include <fma4intrin.h>
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256)))
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
@@ -198,13 +199,13 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
 {
-  return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
+  return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
 {
-  return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
+  return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
 }
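The vpcmov builtins are gone; both intrinsics now spell out the generic bitwise select, which the backend can still pattern-match to VPCMOV when XOP is available. A scalar model of the identity:

    #include <stdint.h>

    /* Where a selector bit in c is 1 the result takes the bit from a,
       otherwise from b: (a & c) | (b & ~c).
       Example: cmov64(0xFFFF, 0x0000, 0x00FF) == 0x00FF. */
    static inline uint64_t cmov64(uint64_t a, uint64_t b, uint64_t c) {
      return (a & c) | (b & ~c);
    }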
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -237,17 +238,17 @@
   return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
 }
 
-#define _mm_roti_epi8(A, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
+#define _mm_roti_epi8(A, N) \
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))
 
-#define _mm_roti_epi16(A, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
+#define _mm_roti_epi16(A, N) \
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))
 
-#define _mm_roti_epi32(A, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
+#define _mm_roti_epi32(A, N) \
+  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))
 
-#define _mm_roti_epi64(A, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
+#define _mm_roti_epi64(A, N) \
+  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_shl_epi8(__m128i __A, __m128i __B)
@@ -297,37 +298,37 @@
   return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
 }
 
-#define _mm_com_epu8(A, B, N) __extension__ ({ \
+#define _mm_com_epu8(A, B, N) \
   (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
-                                  (__v16qi)(__m128i)(B), (N)); })
+                                  (__v16qi)(__m128i)(B), (N))
 
-#define _mm_com_epu16(A, B, N) __extension__ ({ \
+#define _mm_com_epu16(A, B, N) \
   (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
-                                  (__v8hi)(__m128i)(B), (N)); })
+                                  (__v8hi)(__m128i)(B), (N))
 
-#define _mm_com_epu32(A, B, N) __extension__ ({ \
+#define _mm_com_epu32(A, B, N) \
   (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
-                                  (__v4si)(__m128i)(B), (N)); })
+                                  (__v4si)(__m128i)(B), (N))
 
-#define _mm_com_epu64(A, B, N) __extension__ ({ \
+#define _mm_com_epu64(A, B, N) \
   (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
-                                  (__v2di)(__m128i)(B), (N)); })
+                                  (__v2di)(__m128i)(B), (N))
 
-#define _mm_com_epi8(A, B, N) __extension__ ({ \
+#define _mm_com_epi8(A, B, N) \
   (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
-                                 (__v16qi)(__m128i)(B), (N)); })
+                                 (__v16qi)(__m128i)(B), (N))
 
-#define _mm_com_epi16(A, B, N) __extension__ ({ \
+#define _mm_com_epi16(A, B, N) \
   (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
-                                 (__v8hi)(__m128i)(B), (N)); })
+                                 (__v8hi)(__m128i)(B), (N))
 
-#define _mm_com_epi32(A, B, N) __extension__ ({ \
+#define _mm_com_epi32(A, B, N) \
   (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
-                                 (__v4si)(__m128i)(B), (N)); })
+                                 (__v4si)(__m128i)(B), (N))
 
-#define _mm_com_epi64(A, B, N) __extension__ ({ \
+#define _mm_com_epi64(A, B, N) \
   (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
-                                 (__v2di)(__m128i)(B), (N)); })
+                                 (__v2di)(__m128i)(B), (N))
 
 #define _MM_PCOMCTRL_LT    0
 #define _MM_PCOMCTRL_LE    1
@@ -722,24 +723,24 @@
   return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
 }
 
-#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
+#define _mm_permute2_pd(X, Y, C, I) \
   (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
                                      (__v2df)(__m128d)(Y), \
-                                     (__v2di)(__m128i)(C), (I)); })
+                                     (__v2di)(__m128i)(C), (I))
 
-#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
+#define _mm256_permute2_pd(X, Y, C, I) \
   (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
                                         (__v4df)(__m256d)(Y), \
-                                        (__v4di)(__m256i)(C), (I)); })
+                                        (__v4di)(__m256i)(C), (I))
 
-#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
+#define _mm_permute2_ps(X, Y, C, I) \
   (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
-                                    (__v4si)(__m128i)(C), (I)); })
+                                    (__v4si)(__m128i)(C), (I))
 
-#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
+#define _mm256_permute2_ps(X, Y, C, I) \
   (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
                                        (__v8sf)(__m256)(Y), \
-                                       (__v8si)(__m256i)(C), (I)); })
+                                       (__v8si)(__m256i)(C), (I))
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_frcz_ss(__m128 __A)
@@ -765,18 +766,19 @@
   return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_frcz_ps(__m256 __A)
 {
   return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_frcz_pd(__m256d __A)
 {
   return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
 }
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS256
 
 #endif /* __XOPINTRIN_H */
diff --git a/darwin-x86/clang-headers/xsavecintrin.h b/darwin-x86/clang-headers/xsavecintrin.h
index 598470a..25577a9 100644
--- a/darwin-x86/clang-headers/xsavecintrin.h
+++ b/darwin-x86/clang-headers/xsavecintrin.h
@@ -1,4 +1,4 @@
-/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
+/*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/darwin-x86/clang-headers/xsaveintrin.h b/darwin-x86/clang-headers/xsaveintrin.h
index a2e6b2e..16f3a78 100644
--- a/darwin-x86/clang-headers/xsaveintrin.h
+++ b/darwin-x86/clang-headers/xsaveintrin.h
@@ -1,4 +1,4 @@
-/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
+/*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,23 +33,23 @@
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _xsave(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xsave(__p, __m);
+  __builtin_ia32_xsave(__p, __m);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _xrstor(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xrstor(__p, __m);
+  __builtin_ia32_xrstor(__p, __m);
 }
 
 #ifdef __x86_64__
 static __inline__ void __DEFAULT_FN_ATTRS
 _xsave64(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xsave64(__p, __m);
+  __builtin_ia32_xsave64(__p, __m);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _xrstor64(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xrstor64(__p, __m);
+  __builtin_ia32_xrstor64(__p, __m);
 }
 #endif
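Since the _xsave family returns void, dropping the `return` keywords is purely cosmetic. A heavily hedged usage sketch: the 4096-byte area and the 0x3 feature mask (x87|SSE) are assumptions, real code should size the area via CPUID leaf 0xD and build with -mxsave:

    #include <stdlib.h>
    #include <string.h>
    #include <immintrin.h>

    int main(void) {
      void *area = aligned_alloc(64, 4096);  /* 64-byte alignment required */
      if (!area) return 1;
      memset(area, 0, 4096);  /* the XSAVE header must start zeroed */
      _xsave(area, 0x3);      /* save x87 and SSE state */
      _xrstor(area, 0x3);     /* restore it */
      free(area);
      return 0;
    }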
 
diff --git a/darwin-x86/clang-headers/xsaveoptintrin.h b/darwin-x86/clang-headers/xsaveoptintrin.h
index d3faae7..792cf92 100644
--- a/darwin-x86/clang-headers/xsaveoptintrin.h
+++ b/darwin-x86/clang-headers/xsaveoptintrin.h
@@ -1,4 +1,4 @@
-/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
+/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,13 +33,13 @@
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _xsaveopt(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xsaveopt(__p, __m);
+  __builtin_ia32_xsaveopt(__p, __m);
 }
 
 #ifdef __x86_64__
 static __inline__ void __DEFAULT_FN_ATTRS
 _xsaveopt64(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xsaveopt64(__p, __m);
+  __builtin_ia32_xsaveopt64(__p, __m);
 }
 #endif
 
diff --git a/darwin-x86/clang-headers/xsavesintrin.h b/darwin-x86/clang-headers/xsavesintrin.h
index c5e540a..fe2bc4b 100644
--- a/darwin-x86/clang-headers/xsavesintrin.h
+++ b/darwin-x86/clang-headers/xsavesintrin.h
@@ -1,4 +1,4 @@
-/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
+/*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/darwin-x86/clang-headers/xtestintrin.h b/darwin-x86/clang-headers/xtestintrin.h
index 9d3378f..9244243 100644
--- a/darwin-x86/clang-headers/xtestintrin.h
+++ b/darwin-x86/clang-headers/xtestintrin.h
@@ -1,4 +1,4 @@
-/*===---- xtestintrin.h - XTEST intrinsic ---------------------------------===
+/*===---- xtestintrin.h - XTEST intrinsic ----------------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal