Update RenderScript prebuilts for Linux after rebase to LLVM r256229.

Bug: http://b/26987366

Change-Id: I8a33248ceb71e5a9122c6fc6acaac42688cd2f07
diff --git a/renderscript/clang-include/Intrin.h b/renderscript/clang-include/Intrin.h
index 727a55e..6c1d0d1 100644
--- a/renderscript/clang-include/Intrin.h
+++ b/renderscript/clang-include/Intrin.h
@@ -39,6 +39,9 @@
 #include <setjmp.h>
 #endif
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -46,10 +49,7 @@
 #if defined(__MMX__)
 /* And the random ones that aren't in those files. */
 __m64 _m_from_float(float);
-__m64 _m_from_int(int _l);
-void _m_prefetch(void *);
 float _m_to_float(__m64);
-int _m_to_int(__m64 _M);
 #endif
 
 /* Other assorted instruction intrinsics. */
@@ -177,8 +177,6 @@
 unsigned short __cdecl _byteswap_ushort(unsigned short);
 void __cdecl _disable(void);
 void __cdecl _enable(void);
-void __cdecl _fxrstor(void const *);
-void __cdecl _fxsave(void *);
 long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
 static __inline__
 long _InterlockedAnd(long volatile *_Value, long _Mask);
@@ -291,9 +289,6 @@
 static __inline__
 #define _XCR_XFEATURE_ENABLED_MASK 0
 unsigned __int64 __cdecl _xgetbv(unsigned int);
-void __cdecl _xrstor(void const *, unsigned __int64);
-void __cdecl _xsave(void *, unsigned __int64);
-void __cdecl _xsaveopt(void *, unsigned __int64);
 void __cdecl _xsetbv(unsigned int, unsigned __int64);
 
 /* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
@@ -355,8 +350,6 @@
 static __inline__
 unsigned char _bittestandset64(__int64 *, __int64);
 unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
-void __cdecl _fxrstor64(void const *);
-void __cdecl _fxsave64(void *);
 long _InterlockedAnd_np(long volatile *_Value, long _Mask);
 short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
 __int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
@@ -421,7 +414,7 @@
  * Multiply two 64-bit integers and obtain a 64-bit result.
  * The low-half is returned directly and the high half is in an out parameter.
  */
-static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
 _umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand,
          unsigned __int64 *_HighProduct) {
   unsigned __int128 _FullProduct =
@@ -429,69 +422,77 @@
   *_HighProduct = _FullProduct >> 64;
   return _FullProduct;
 }
-static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
 __umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) {
   unsigned __int128 _FullProduct =
       (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
   return _FullProduct >> 64;
 }
-void __cdecl _xrstor64(void const *, unsigned __int64);
-void __cdecl _xsave64(void *, unsigned __int64);
-void __cdecl _xsaveopt64(void *, unsigned __int64);
 
 #endif /* __x86_64__ */
 
 /*----------------------------------------------------------------------------*\
+|* Multiplication
+\*----------------------------------------------------------------------------*/
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+__emul(int __in1, int __in2) {
+  return (__int64)__in1 * (__int64)__in2;
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__emulu(unsigned int __in1, unsigned int __in2) {
+  return (unsigned __int64)__in1 * (unsigned __int64)__in2;
+}
+/*----------------------------------------------------------------------------*\
 |* Bit Twiddling
 \*----------------------------------------------------------------------------*/
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _rotl8(unsigned char _Value, unsigned char _Shift) {
   _Shift &= 0x7;
   return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _rotr8(unsigned char _Value, unsigned char _Shift) {
   _Shift &= 0x7;
   return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
 }
-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
 _rotl16(unsigned short _Value, unsigned char _Shift) {
   _Shift &= 0xf;
   return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
 }
-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
 _rotr16(unsigned short _Value, unsigned char _Shift) {
   _Shift &= 0xf;
   return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
 }
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _rotl(unsigned int _Value, int _Shift) {
   _Shift &= 0x1f;
   return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
 }
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _rotr(unsigned int _Value, int _Shift) {
   _Shift &= 0x1f;
   return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
 }
-static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
 _lrotl(unsigned long _Value, int _Shift) {
   _Shift &= 0x1f;
   return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
 }
-static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
 _lrotr(unsigned long _Value, int _Shift) {
   _Shift &= 0x1f;
   return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
 }
 static
-__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
 _rotl64(unsigned __int64 _Value, int _Shift) {
   _Shift &= 0x3f;
   return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
 }
 static
-__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
 _rotr64(unsigned __int64 _Value, int _Shift) {
   _Shift &= 0x3f;
   return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
@@ -499,71 +500,64 @@
 /*----------------------------------------------------------------------------*\
 |* Bit Counting and Testing
 \*----------------------------------------------------------------------------*/
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _BitScanForward(unsigned long *_Index, unsigned long _Mask) {
   if (!_Mask)
     return 0;
   *_Index = __builtin_ctzl(_Mask);
   return 1;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
   if (!_Mask)
     return 0;
   *_Index = 31 - __builtin_clzl(_Mask);
   return 1;
 }
-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
-__popcnt16(unsigned short value) {
-  return __builtin_popcount((int)value);
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__popcnt16(unsigned short _Value) {
+  return __builtin_popcount((int)_Value);
 }
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
-__popcnt(unsigned int value) {
-  return __builtin_popcount(value);
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__popcnt(unsigned int _Value) {
+  return __builtin_popcount(_Value);
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_bittest(long const *a, long b) {
-  return (*a >> b) & 1;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest(long const *_BitBase, long _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_bittestandcomplement(long *a, long b) {
-  unsigned char x = (*a >> b) & 1;
-  *a = *a ^ (1 << b);
-  return x;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1 << _BitPos);
+  return _Res;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_bittestandreset(long *a, long b) {
-  unsigned char x = (*a >> b) & 1;
-  *a = *a & ~(1 << b);
-  return x;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1 << _BitPos);
+  return _Res;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_bittestandset(long *a, long b) {
-  unsigned char x = (*a >> b) & 1;
-  *a = *a | (1 << b);
-  return x;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1 << _BitPos);
+  return _Res;
 }
-#if defined(__i386__) || defined(__x86_64__)
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_interlockedbittestandset(long volatile *__BitBase, long __BitPos) {
-  unsigned char __Res;
-  __asm__ ("xor %0, %0\n"
-           "lock bts %2, %1\n"
-           "setc %0\n"
-           : "=r" (__Res), "+m"(*__BitBase)
-           : "Ir"(__BitPos));
-  return __Res;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
 }
-#endif
 #ifdef __x86_64__
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
   if (!_Mask)
     return 0;
   *_Index = __builtin_ctzll(_Mask);
   return 1;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
   if (!_Mask)
     return 0;
@@ -571,235 +565,232 @@
   return 1;
 }
 static __inline__
-unsigned __int64 __attribute__((__always_inline__, __nodebug__))
- __popcnt64(unsigned __int64 value) {
-  return __builtin_popcountll(value);
+unsigned __int64 __DEFAULT_FN_ATTRS
+__popcnt64(unsigned __int64 _Value) {
+  return __builtin_popcountll(_Value);
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_bittest64(__int64 const *a, __int64 b) {
-  return (*a >> b) & 1;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_bittestandcomplement64(__int64 *a, __int64 b) {
-  unsigned char x = (*a >> b) & 1;
-  *a = *a ^ (1ll << b);
-  return x;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1ll << _BitPos);
+  return _Res;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_bittestandreset64(__int64 *a, __int64 b) {
-  unsigned char x = (*a >> b) & 1;
-  *a = *a & ~(1ll << b);
-  return x;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1ll << _BitPos);
+  return _Res;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_bittestandset64(__int64 *a, __int64 b) {
-  unsigned char x = (*a >> b) & 1;
-  *a = *a | (1ll << b);
-  return x;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1ll << _BitPos);
+  return _Res;
 }
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
-_interlockedbittestandset64(__int64 volatile *__BitBase, __int64 __BitPos) {
-  unsigned char __Res;
-  __asm__ ("xor %0, %0\n"
-           "lock bts %2, %1\n"
-           "setc %0\n"
-           : "=r" (__Res), "+m"(*__BitBase)
-           : "Ir"(__BitPos));
-  return __Res;
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
+  long long _PrevVal =
+      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Exchange Add
 \*----------------------------------------------------------------------------*/
-static __inline__ char __attribute__((__always_inline__, __nodebug__))
+static __inline__ char __DEFAULT_FN_ATTRS
 _InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
-  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
 }
-static __inline__ short __attribute__((__always_inline__, __nodebug__))
+static __inline__ short __DEFAULT_FN_ATTRS
 _InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
-  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
 }
 #ifdef __x86_64__
-static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
-  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Exchange Sub
 \*----------------------------------------------------------------------------*/
-static __inline__ char __attribute__((__always_inline__, __nodebug__))
+static __inline__ char __DEFAULT_FN_ATTRS
 _InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
-  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
 }
-static __inline__ short __attribute__((__always_inline__, __nodebug__))
+static __inline__ short __DEFAULT_FN_ATTRS
 _InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
-  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
 }
-static __inline__ long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long __DEFAULT_FN_ATTRS
 _InterlockedExchangeSub(long volatile *_Subend, long _Value) {
-  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
 }
 #ifdef __x86_64__
-static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
-  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Increment
 \*----------------------------------------------------------------------------*/
-static __inline__ short __attribute__((__always_inline__, __nodebug__))
+static __inline__ short __DEFAULT_FN_ATTRS
 _InterlockedIncrement16(short volatile *_Value) {
-  return __atomic_add_fetch(_Value, 1, 0);
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
 }
 #ifdef __x86_64__
-static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedIncrement64(__int64 volatile *_Value) {
-  return __atomic_add_fetch(_Value, 1, 0);
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Decrement
 \*----------------------------------------------------------------------------*/
-static __inline__ short __attribute__((__always_inline__, __nodebug__))
+static __inline__ short __DEFAULT_FN_ATTRS
 _InterlockedDecrement16(short volatile *_Value) {
-  return __atomic_sub_fetch(_Value, 1, 0);
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
 }
 #ifdef __x86_64__
-static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedDecrement64(__int64 volatile *_Value) {
-  return __atomic_sub_fetch(_Value, 1, 0);
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked And
 \*----------------------------------------------------------------------------*/
-static __inline__ char __attribute__((__always_inline__, __nodebug__))
+static __inline__ char __DEFAULT_FN_ATTRS
 _InterlockedAnd8(char volatile *_Value, char _Mask) {
-  return __atomic_and_fetch(_Value, _Mask, 0);
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
-static __inline__ short __attribute__((__always_inline__, __nodebug__))
+static __inline__ short __DEFAULT_FN_ATTRS
 _InterlockedAnd16(short volatile *_Value, short _Mask) {
-  return __atomic_and_fetch(_Value, _Mask, 0);
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
-static __inline__ long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long __DEFAULT_FN_ATTRS
 _InterlockedAnd(long volatile *_Value, long _Mask) {
-  return __atomic_and_fetch(_Value, _Mask, 0);
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
 #ifdef __x86_64__
-static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
-  return __atomic_and_fetch(_Value, _Mask, 0);
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Or
 \*----------------------------------------------------------------------------*/
-static __inline__ char __attribute__((__always_inline__, __nodebug__))
+static __inline__ char __DEFAULT_FN_ATTRS
 _InterlockedOr8(char volatile *_Value, char _Mask) {
-  return __atomic_or_fetch(_Value, _Mask, 0);
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
-static __inline__ short __attribute__((__always_inline__, __nodebug__))
+static __inline__ short __DEFAULT_FN_ATTRS
 _InterlockedOr16(short volatile *_Value, short _Mask) {
-  return __atomic_or_fetch(_Value, _Mask, 0);
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
-static __inline__ long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long __DEFAULT_FN_ATTRS
 _InterlockedOr(long volatile *_Value, long _Mask) {
-  return __atomic_or_fetch(_Value, _Mask, 0);
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
 #ifdef __x86_64__
-static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
-  return __atomic_or_fetch(_Value, _Mask, 0);
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Xor
 \*----------------------------------------------------------------------------*/
-static __inline__ char __attribute__((__always_inline__, __nodebug__))
+static __inline__ char __DEFAULT_FN_ATTRS
 _InterlockedXor8(char volatile *_Value, char _Mask) {
-  return __atomic_xor_fetch(_Value, _Mask, 0);
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
-static __inline__ short __attribute__((__always_inline__, __nodebug__))
+static __inline__ short __DEFAULT_FN_ATTRS
 _InterlockedXor16(short volatile *_Value, short _Mask) {
-  return __atomic_xor_fetch(_Value, _Mask, 0);
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
-static __inline__ long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long __DEFAULT_FN_ATTRS
 _InterlockedXor(long volatile *_Value, long _Mask) {
-  return __atomic_xor_fetch(_Value, _Mask, 0);
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
 #ifdef __x86_64__
-static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
-  return __atomic_xor_fetch(_Value, _Mask, 0);
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Exchange
 \*----------------------------------------------------------------------------*/
-static __inline__ char __attribute__((__always_inline__, __nodebug__))
+static __inline__ char __DEFAULT_FN_ATTRS
 _InterlockedExchange8(char volatile *_Target, char _Value) {
-  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
   return _Value;
 }
-static __inline__ short __attribute__((__always_inline__, __nodebug__))
+static __inline__ short __DEFAULT_FN_ATTRS
 _InterlockedExchange16(short volatile *_Target, short _Value) {
-  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
   return _Value;
 }
 #ifdef __x86_64__
-static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
-  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
   return _Value;
 }
 #endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Compare Exchange
 \*----------------------------------------------------------------------------*/
-static __inline__ char __attribute__((__always_inline__, __nodebug__))
+static __inline__ char __DEFAULT_FN_ATTRS
 _InterlockedCompareExchange8(char volatile *_Destination,
                              char _Exchange, char _Comparand) {
-  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
   return _Comparand;
 }
-static __inline__ short __attribute__((__always_inline__, __nodebug__))
+static __inline__ short __DEFAULT_FN_ATTRS
 _InterlockedCompareExchange16(short volatile *_Destination,
                               short _Exchange, short _Comparand) {
-  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
   return _Comparand;
 }
-static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedCompareExchange64(__int64 volatile *_Destination,
                               __int64 _Exchange, __int64 _Comparand) {
-  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
   return _Comparand;
 }
 /*----------------------------------------------------------------------------*\
 |* Barriers
 \*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
 _ReadWriteBarrier(void) {
-  __asm__ volatile ("" : : : "memory");
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
 _ReadBarrier(void) {
-  __asm__ volatile ("" : : : "memory");
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
 _WriteBarrier(void) {
-  __asm__ volatile ("" : : : "memory");
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
 }
-#endif
 #ifdef __x86_64__
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __faststorefence(void) {
-  __asm__ volatile("lock orq $0, (%%rsp)" : : : "memory");
+  __atomic_thread_fence(__ATOMIC_SEQ_CST);
 }
 #endif
 /*----------------------------------------------------------------------------*\
@@ -812,33 +803,33 @@
     (__offset))
 
 #ifdef __i386__
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
 __readfsbyte(unsigned long __offset) {
   return *__ptr_to_addr_space(257, unsigned char, __offset);
 }
-static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
 __readfsqword(unsigned long __offset) {
   return *__ptr_to_addr_space(257, unsigned __int64, __offset);
 }
-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
 __readfsword(unsigned long __offset) {
   return *__ptr_to_addr_space(257, unsigned short, __offset);
 }
 #endif
 #ifdef __x86_64__
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
 __readgsbyte(unsigned long __offset) {
   return *__ptr_to_addr_space(256, unsigned char, __offset);
 }
-static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
 __readgsdword(unsigned long __offset) {
   return *__ptr_to_addr_space(256, unsigned long, __offset);
 }
-static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
 __readgsqword(unsigned long __offset) {
   return *__ptr_to_addr_space(256, unsigned __int64, __offset);
 }
-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
 __readgsword(unsigned long __offset) {
   return *__ptr_to_addr_space(256, unsigned short, __offset);
 }
@@ -848,44 +839,44 @@
 |* movs, stos
 \*----------------------------------------------------------------------------*/
 #if defined(__i386__) || defined(__x86_64__)
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
   __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
                         : "%edi", "%esi", "%ecx");
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
   __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
                         : "%edi", "%esi", "%ecx");
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
-  __asm__("rep movsh" : : "D"(__dst), "S"(__src), "c"(__n)
+  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)
                         : "%edi", "%esi", "%ecx");
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
   __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
                         : "%edi", "%ecx");
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
   __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
                         : "%edi", "%ecx");
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
-  __asm__("rep stosh" : : "D"(__dst), "a"(__x), "c"(__n)
+  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)
                         : "%edi", "%ecx");
 }
 #endif
 #ifdef __x86_64__
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
   __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
                         : "%edi", "%esi", "%ecx");
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
   __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
                         : "%edi", "%ecx");
@@ -895,32 +886,32 @@
 /*----------------------------------------------------------------------------*\
 |* Misc
 \*----------------------------------------------------------------------------*/
-static __inline__ void * __attribute__((__always_inline__, __nodebug__))
+static __inline__ void * __DEFAULT_FN_ATTRS
 _AddressOfReturnAddress(void) {
   return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
 }
-static __inline__ void * __attribute__((__always_inline__, __nodebug__))
+static __inline__ void * __DEFAULT_FN_ATTRS
 _ReturnAddress(void) {
   return __builtin_return_address(0);
 }
 #if defined(__i386__) || defined(__x86_64__)
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __cpuid(int __info[4], int __level) {
   __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
                    : "a"(__level));
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __cpuidex(int __info[4], int __level, int __ecx) {
   __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
                    : "a"(__level), "c"(__ecx));
 }
-static __inline__ unsigned __int64 __cdecl __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
 _xgetbv(unsigned int __xcr_no) {
   unsigned int __eax, __edx;
   __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
   return ((unsigned __int64)__edx << 32) | __eax;
 }
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __halt(void) {
   __asm__ volatile ("hlt");
 }
@@ -930,7 +921,7 @@
 |* Privileged intrinsics
 \*----------------------------------------------------------------------------*/
 #if defined(__i386__) || defined(__x86_64__)
-static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
 __readmsr(unsigned long __register) {
   // Loads the contents of a 64-bit model specific register (MSR) specified in
   // the ECX register into registers EDX:EAX. The EDX register is loaded with
@@ -944,14 +935,14 @@
   return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
 }
 
-static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
 __readcr3(void) {
   unsigned long __cr3_val;
   __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
   return __cr3_val;
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __writecr3(unsigned int __cr3_val) {
   __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
 }
@@ -961,5 +952,7 @@
 }
 #endif
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __INTRIN_H */
 #endif /* _MSC_VER */
diff --git a/renderscript/clang-include/__clang_cuda_runtime_wrapper.h b/renderscript/clang-include/__clang_cuda_runtime_wrapper.h
new file mode 100644
index 0000000..8e5f033
--- /dev/null
+++ b/renderscript/clang-include/__clang_cuda_runtime_wrapper.h
@@ -0,0 +1,216 @@
+/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * WARNING: This header is intended to be directly -include'd by
+ * the compiler and is not supposed to be included by users.
+ *
+ * CUDA headers are implemented in a way that currently makes it
+ * impossible for user code to #include directly when compiling with
+ * Clang. They present different view of CUDA-supplied functions
+ * depending on where in NVCC's compilation pipeline the headers are
+ * included. Neither of these modes provides function definitions with
+ * correct attributes, so we use preprocessor to force the headers
+ * into a form that Clang can use.
+ *
+ * Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
+ * this file during every CUDA compilation.
+ */
+
+#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
+#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
+
+#if defined(__CUDA__) && defined(__clang__)
+
+// Include some standard headers to avoid CUDA headers including them
+// while some required macros (like __THROW) are in a weird state.
+#include <stdlib.h>
+#include <cmath>
+
+// Preserve common macros that will be changed below by us or by CUDA
+// headers.
+#pragma push_macro("__THROW")
+#pragma push_macro("__CUDA_ARCH__")
+
+// WARNING: Preprocessor hacks below are based on specific details of
+// CUDA-7.x headers and are not expected to work with any other
+// version of CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050
+#error "Unsupported CUDA version!"
+#endif
+
+// Make largest subset of device functions available during host
+// compilation -- SM_35 for the time being.
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 350
+#endif
+
+#include "cuda_builtin_vars.h"
+
+// No need for device_launch_parameters.h as cuda_builtin_vars.h above
+// has taken care of builtin variables declared in the file.
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+// {math,device}_functions.h only have declarations of the
+// functions. We don't need them as we're going to pull in their
+// definitions from .hpp files.
+#define __DEVICE_FUNCTIONS_H__
+#define __MATH_FUNCTIONS_H__
+
+#undef __CUDACC__
+#define __CUDABE__
+// Disables definitions of device-side runtime support stubs in
+// cuda_device_runtime_api.h
+#define __CUDADEVRT_INTERNAL__
+#include "host_config.h"
+#include "host_defines.h"
+#include "driver_types.h"
+#include "common_functions.h"
+#undef __CUDADEVRT_INTERNAL__
+
+#undef __CUDABE__
+#define __CUDACC__
+#include "cuda_runtime.h"
+
+#undef __CUDACC__
+#define __CUDABE__
+
+// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
+// not have at the moment. Emulate them with a builtin memcpy/memset.
+#define __nvvm_memcpy(s,d,n,a) __builtin_memcpy(s,d,n)
+#define __nvvm_memset(d,c,n,a) __builtin_memset(d,c,n)
+
+#include "crt/host_runtime.h"
+#include "crt/device_runtime.h"
+// device_runtime.h defines __cxa_* macros that will conflict with
+// cxxabi.h.
+// FIXME: redefine these as __device__ functions.
+#undef __cxa_vec_ctor
+#undef __cxa_vec_cctor
+#undef __cxa_vec_dtor
+#undef __cxa_vec_new2
+#undef __cxa_vec_new3
+#undef __cxa_vec_delete2
+#undef __cxa_vec_delete
+#undef __cxa_vec_delete3
+#undef __cxa_pure_virtual
+
+// We need decls for functions in CUDA's libdevice with __device__
+// attribute only. Alas they come either as __host__ __device__ or
+// with no attributes at all. To work around that, define __CUDA_RTC__
+// which produces HD variant and undef __host__ which gives us desided
+// decls with __device__ attribute.
+#pragma push_macro("__host__")
+#define __host__
+#define __CUDACC_RTC__
+#include "device_functions_decls.h"
+#undef __CUDACC_RTC__
+
+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+#include "device_functions.hpp"
+#include "math_functions.hpp"
+#include "math_functions_dbl_ptx3.hpp"
+#pragma pop_macro("__forceinline__")
+
+// Pull in host-only functions that are only available when neither
+// __CUDACC__ nor __CUDABE__ are defined.
+#undef __MATH_FUNCTIONS_HPP__
+#undef __CUDABE__
+#include "math_functions.hpp"
+// Alas, additional overloads for these functions are hard to get to.
+// Considering that we only need these overloads for a few functions,
+// we can provide them here.
+static inline float rsqrt(float a) { return rsqrtf(a); }
+static inline float rcbrt(float a) { return rcbrtf(a); }
+static inline float sinpi(float a) { return sinpif(a); }
+static inline float cospi(float a) { return cospif(a); }
+static inline void sincospi(float a, float *b, float *c) {
+  return sincospi(a, b, c);
+}
+static inline float erfcinv(float a) { return erfcinvf(a); }
+static inline float normcdfinv(float a) { return normcdfinvf(a); }
+static inline float normcdf(float a) { return normcdff(a); }
+static inline float erfcx(float a) { return erfcxf(a); }
+
+// For some reason single-argument variant is not always declared by
+// CUDA headers. Alas, device_functions.hpp included below needs it.
+static inline __device__ void __brkpt(int c) { __brkpt(); }
+
+// Now include *.hpp with definitions of various GPU functions.  Alas,
+// a lot of thins get declared/defined with __host__ attribute which
+// we don't want and we have to define it out. We also have to include
+// {device,math}_functions.hpp again in order to extract the other
+// branch of #if/else inside.
+
+#define __host__
+#undef __CUDABE__
+#define __CUDACC__
+#undef __DEVICE_FUNCTIONS_HPP__
+#include "device_functions.hpp"
+#include "device_atomic_functions.hpp"
+#include "sm_20_atomic_functions.hpp"
+#include "sm_32_atomic_functions.hpp"
+#include "sm_20_intrinsics.hpp"
+// sm_30_intrinsics.h has declarations that use default argument, so
+// we have to include it and it will in turn include .hpp
+#include "sm_30_intrinsics.h"
+#include "sm_32_intrinsics.hpp"
+#undef __MATH_FUNCTIONS_HPP__
+#include "math_functions.hpp"
+#pragma pop_macro("__host__")
+
+#include "texture_indirect_functions.h"
+
+// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
+#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("__THROW")
+
+// Set up compiler macros expected to be seen during compilation.
+#undef __CUDABE__
+#define __CUDACC__
+#define __NVCC__
+
+#if defined(__CUDA_ARCH__)
+// We need to emit IR declaration for non-existing __nvvm_reflect() to
+// let backend know that it should be treated as const nothrow
+// function which is what NVVMReflect pass expects to see.
+extern "C" __device__ __attribute__((const)) int __nvvm_reflect(const void *);
+static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
+  return __nvvm_reflect("NONE");
+}
+#endif
+
+#endif // __CUDA__
+#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
diff --git a/renderscript/clang-include/__wmmintrin_aes.h b/renderscript/clang-include/__wmmintrin_aes.h
index 2bfa027..100799e 100644
--- a/renderscript/clang-include/__wmmintrin_aes.h
+++ b/renderscript/clang-include/__wmmintrin_aes.h
@@ -25,43 +25,42 @@
 
 #include <emmintrin.h>
 
-#if !defined (__AES__)
-#  error "AES instructions not enabled"
-#else
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesenc_si128(__m128i __V, __m128i __R)
 {
   return (__m128i)__builtin_ia32_aesenc128(__V, __R);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesenclast_si128(__m128i __V, __m128i __R)
 {
   return (__m128i)__builtin_ia32_aesenclast128(__V, __R);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesdec_si128(__m128i __V, __m128i __R)
 {
   return (__m128i)__builtin_ia32_aesdec128(__V, __R);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesdeclast_si128(__m128i __V, __m128i __R)
 {
   return (__m128i)__builtin_ia32_aesdeclast128(__V, __R);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesimc_si128(__m128i __V)
 {
   return (__m128i)__builtin_ia32_aesimc128(__V);
 }
 
 #define _mm_aeskeygenassist_si128(C, R) \
-  __builtin_ia32_aeskeygenassist128((C), (R))
+  (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
 
-#endif
+#undef __DEFAULT_FN_ATTRS
 
 #endif  /* _WMMINTRIN_AES_H */
diff --git a/renderscript/clang-include/__wmmintrin_pclmul.h b/renderscript/clang-include/__wmmintrin_pclmul.h
index 8d1f1b7..68e944e 100644
--- a/renderscript/clang-include/__wmmintrin_pclmul.h
+++ b/renderscript/clang-include/__wmmintrin_pclmul.h
@@ -1,4 +1,4 @@
-/*===---- __wmmintrin_pclmul.h - AES intrinsics ----------------------------===
+/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -23,12 +23,8 @@
 #ifndef _WMMINTRIN_PCLMUL_H
 #define _WMMINTRIN_PCLMUL_H
 
-#if !defined (__PCLMUL__)
-# error "PCLMUL instruction is not enabled"
-#else
 #define _mm_clmulepi64_si128(__X, __Y, __I) \
   ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
                                         (__v2di)(__m128i)(__Y), (char)(__I)))
-#endif
 
 #endif /* _WMMINTRIN_PCLMUL_H */
diff --git a/renderscript/clang-include/adxintrin.h b/renderscript/clang-include/adxintrin.h
index 9db8bcb..ee34728 100644
--- a/renderscript/clang-include/adxintrin.h
+++ b/renderscript/clang-include/adxintrin.h
@@ -28,9 +28,11 @@
 #ifndef __ADXINTRIN_H
 #define __ADXINTRIN_H
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
 /* Intrinsics that are available only if __ADX__ defined */
-#ifdef __ADX__
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
 _addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
                unsigned int *__p)
 {
@@ -38,17 +40,16 @@
 }
 
 #ifdef __x86_64__
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
 _addcarryx_u64(unsigned char __cf, unsigned long long __x,
                unsigned long long __y, unsigned long long  *__p)
 {
   return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
 }
 #endif
-#endif
 
 /* Intrinsics that are also available if __ADX__ undefined */
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline unsigned char __DEFAULT_FN_ATTRS
 _addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
               unsigned int *__p)
 {
@@ -56,7 +57,7 @@
 }
 
 #ifdef __x86_64__
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline unsigned char __DEFAULT_FN_ATTRS
 _addcarry_u64(unsigned char __cf, unsigned long long __x,
               unsigned long long __y, unsigned long long  *__p)
 {
@@ -64,7 +65,7 @@
 }
 #endif
 
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline unsigned char __DEFAULT_FN_ATTRS
 _subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
               unsigned int *__p)
 {
@@ -72,7 +73,7 @@
 }
 
 #ifdef __x86_64__
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline unsigned char __DEFAULT_FN_ATTRS
 _subborrow_u64(unsigned char __cf, unsigned long long __x,
                unsigned long long __y, unsigned long long  *__p)
 {
@@ -80,4 +81,6 @@
 }
 #endif
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __ADXINTRIN_H */
diff --git a/renderscript/clang-include/altivec.h b/renderscript/clang-include/altivec.h
index 28df890..c39156e 100644
--- a/renderscript/clang-include/altivec.h
+++ b/renderscript/clang-include/altivec.h
@@ -48,7 +48,8 @@
                                               vector bool char __b,
                                               vector unsigned char __c);
 
-static vector short __ATTRS_o_ai vec_perm(vector short __a, vector short __b,
+static vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                          vector signed short __b,
                                           vector unsigned char __c);
 
 static vector unsigned short __ATTRS_o_ai vec_perm(vector unsigned short __a,
@@ -62,7 +63,8 @@
 static vector pixel __ATTRS_o_ai vec_perm(vector pixel __a, vector pixel __b,
                                           vector unsigned char __c);
 
-static vector int __ATTRS_o_ai vec_perm(vector int __a, vector int __b,
+static vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                        vector signed int __b,
                                         vector unsigned char __c);
 
 static vector unsigned int __ATTRS_o_ai vec_perm(vector unsigned int __a,
@@ -77,14 +79,18 @@
                                           vector unsigned char __c);
 
 #ifdef __VSX__
-static vector long long __ATTRS_o_ai vec_perm(vector long long __a,
-                                              vector long long __b,
+static vector long long __ATTRS_o_ai vec_perm(vector signed long long __a,
+                                              vector signed long long __b,
                                               vector unsigned char __c);
 
 static vector unsigned long long __ATTRS_o_ai
 vec_perm(vector unsigned long long __a, vector unsigned long long __b,
          vector unsigned char __c);
 
+static vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c);
+
 static vector double __ATTRS_o_ai vec_perm(vector double __a, vector double __b,
                                            vector unsigned char __c);
 #endif
@@ -110,14 +116,28 @@
   return __builtin_altivec_vmaxsw(__a, -__a);
 }
 
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed long long __ATTRS_o_ai
+vec_abs(vector signed long long __a) {
+  return __builtin_altivec_vmaxsd(__a, -__a);
+}
+#endif
+
 static vector float __ATTRS_o_ai vec_abs(vector float __a) {
   vector unsigned int __res =
       (vector unsigned int)__a & (vector unsigned int)(0x7FFFFFFF);
   return (vector float)__res;
 }
 
-/* vec_abss */
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector double __ATTRS_o_ai vec_abs(vector double __a) {
+  vector unsigned long long __res = { 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF };
+  __res &= (vector unsigned int)__a;
+  return (vector double)__res;
+}
+#endif
 
+/* vec_abss */
 #define __builtin_altivec_abss_v16qi vec_abss
 #define __builtin_altivec_abss_v8hi vec_abss
 #define __builtin_altivec_abss_v4si vec_abss
@@ -226,6 +246,16 @@
 }
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed long long __ATTRS_o_ai
+vec_add(vector signed long long __a, vector signed long long __b) {
+  return __a + __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_add(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a + __b;
+}
+
 static vector signed __int128 __ATTRS_o_ai vec_add(vector signed __int128 __a,
                                                    vector signed __int128 __b) {
   return __a + __b;
@@ -241,6 +271,45 @@
   return __a + __b;
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_add(vector double __a, vector double __b) {
+  return __a + __b;
+}
+#endif // __VSX__
+
+/* vec_adde */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_adde(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_adde(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+#endif
+
+/* vec_addec */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_addec(vector signed __int128 __a, vector signed __int128 __b,
+          vector signed __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_addec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+          vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+#endif
+
 /* vec_vaddubm */
 
 #define __builtin_altivec_vaddubm vec_vaddubm
@@ -353,6 +422,12 @@
 
 /* vec_addc */
 
+static vector signed int __ATTRS_o_ai vec_addc(vector signed int __a,
+                                               vector signed int __b) {
+  return (vector signed int)__builtin_altivec_vaddcuw((vector unsigned int)__a,
+                                                      (vector unsigned int)__b);
+}
+
 static vector unsigned int __ATTRS_o_ai vec_addc(vector unsigned int __a,
                                                  vector unsigned int __b) {
   return __builtin_altivec_vaddcuw(__a, __b);
@@ -361,7 +436,9 @@
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
 static vector signed __int128 __ATTRS_o_ai
 vec_addc(vector signed __int128 __a, vector signed __int128 __b) {
-  return __builtin_altivec_vaddcuq(__a, __b);
+  return (vector signed __int128)__builtin_altivec_vaddcuq(
+    (vector unsigned __int128)__a,
+    (vector unsigned __int128)__b);
 }
 
 static vector unsigned __int128 __ATTRS_o_ai
@@ -746,6 +823,24 @@
 }
 
 #ifdef __VSX__
+static vector double __ATTRS_o_ai vec_and(vector bool long long __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai vec_and(vector double __a, vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai vec_and(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
 static vector signed long long __ATTRS_o_ai
 vec_and(vector signed long long __a, vector signed long long __b) {
   return __a & __b;
@@ -1068,6 +1163,26 @@
 }
 
 #ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai
+vec_andc(vector double __a, vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai vec_andc(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
 static vector signed long long __ATTRS_o_ai
 vec_andc(vector signed long long __a, vector signed long long __b) {
   return __a & ~__b;
@@ -1338,11 +1453,20 @@
 
 /* vec_ceil */
 
-static vector float __attribute__((__always_inline__))
-vec_ceil(vector float __a) {
+static vector float __ATTRS_o_ai vec_ceil(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspip(__a);
+#else
   return __builtin_altivec_vrfip(__a);
+#endif
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_ceil(vector double __a) {
+  return __builtin_vsx_xvrdpip(__a);
+}
+#endif
+
 /* vec_vrfip */
 
 static vector float __attribute__((__always_inline__))
@@ -1414,22 +1538,20 @@
 
 static vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
                                               vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpeqsp(__a, __b);
+#else
   return (vector bool int)__builtin_altivec_vcmpeqfp(__a, __b);
+#endif
 }
 
-/* vec_cmpge */
-
-static vector bool int __attribute__((__always_inline__))
-vec_cmpge(vector float __a, vector float __b) {
-  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpeqdp(__a, __b);
 }
+#endif
 
-/* vec_vcmpgefp */
-
-static vector bool int __attribute__((__always_inline__))
-vec_vcmpgefp(vector float __a, vector float __b) {
-  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
-}
 
 /* vec_cmpgt */
 
@@ -1476,7 +1598,85 @@
 
 static vector bool int __ATTRS_o_ai vec_cmpgt(vector float __a,
                                               vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgtsp(__a, __b);
+#else
   return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgtdp(__a, __b);
+}
+#endif
+
+/* vec_cmpge */
+
+static vector bool char __ATTRS_o_ai
+vec_cmpge (vector signed char __a, vector signed char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmpge (vector unsigned char __a, vector unsigned char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpge (vector signed short __a, vector signed short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpge (vector unsigned short __a, vector unsigned short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpge (vector signed int __a, vector signed int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpge (vector unsigned int __a, vector unsigned int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgesp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgedp(__a, __b);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+#endif
+
+/* vec_vcmpgefp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgefp(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
 }
 
 /* vec_vcmpgtsb */
@@ -1530,47 +1730,157 @@
 
 /* vec_cmple */
 
-static vector bool int __attribute__((__always_inline__))
-vec_cmple(vector float __a, vector float __b) {
-  return (vector bool int)__builtin_altivec_vcmpgefp(__b, __a);
+static vector bool char __ATTRS_o_ai
+vec_cmple (vector signed char __a, vector signed char __b) {
+  return vec_cmpge(__b, __a);
 }
 
+static vector bool char __ATTRS_o_ai
+vec_cmple (vector unsigned char __a, vector unsigned char __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmple (vector signed short __a, vector signed short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmple (vector unsigned short __a, vector unsigned short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmple (vector signed int __a, vector signed int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmple (vector unsigned int __a, vector unsigned int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmple(vector float __a, vector float __b) {
+  return vec_cmpge(__b, __a);
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmple(vector double __a, vector double __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
 /* vec_cmplt */
 
 static vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a,
                                                vector signed char __b) {
-  return (vector bool char)__builtin_altivec_vcmpgtsb(__b, __a);
+  return vec_cmpgt(__b, __a);
 }
 
 static vector bool char __ATTRS_o_ai vec_cmplt(vector unsigned char __a,
                                                vector unsigned char __b) {
-  return (vector bool char)__builtin_altivec_vcmpgtub(__b, __a);
+  return vec_cmpgt(__b, __a);
 }
 
 static vector bool short __ATTRS_o_ai vec_cmplt(vector short __a,
                                                 vector short __b) {
-  return (vector bool short)__builtin_altivec_vcmpgtsh(__b, __a);
+  return vec_cmpgt(__b, __a);
 }
 
 static vector bool short __ATTRS_o_ai vec_cmplt(vector unsigned short __a,
                                                 vector unsigned short __b) {
-  return (vector bool short)__builtin_altivec_vcmpgtuh(__b, __a);
+  return vec_cmpgt(__b, __a);
 }
 
 static vector bool int __ATTRS_o_ai vec_cmplt(vector int __a, vector int __b) {
-  return (vector bool int)__builtin_altivec_vcmpgtsw(__b, __a);
+  return vec_cmpgt(__b, __a);
 }
 
 static vector bool int __ATTRS_o_ai vec_cmplt(vector unsigned int __a,
                                               vector unsigned int __b) {
-  return (vector bool int)__builtin_altivec_vcmpgtuw(__b, __a);
+  return vec_cmpgt(__b, __a);
 }
 
 static vector bool int __ATTRS_o_ai vec_cmplt(vector float __a,
                                               vector float __b) {
-  return (vector bool int)__builtin_altivec_vcmpgtfp(__b, __a);
+  return vec_cmpgt(__b, __a);
 }
 
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmplt(vector double __a, vector double __b) {
+  return vec_cmpgt(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+/* vec_cntlz */
+
+static vector signed char __ATTRS_o_ai vec_cntlz(vector signed char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static vector unsigned char __ATTRS_o_ai vec_cntlz(vector unsigned char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static vector signed short __ATTRS_o_ai vec_cntlz(vector signed short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static vector unsigned short __ATTRS_o_ai vec_cntlz(vector unsigned short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static vector signed int __ATTRS_o_ai vec_cntlz(vector signed int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static vector unsigned int __ATTRS_o_ai vec_cntlz(vector unsigned int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static vector signed long long __ATTRS_o_ai
+vec_cntlz(vector signed long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+#endif
+
+/* vec_cpsgn */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai vec_cpsgn(vector float __a, vector float __b) {
+  return __builtin_vsx_xvcpsgnsp(__a, __b);
+}
+
+static vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
+                                            vector double __b) {
+  return __builtin_vsx_xvcpsgndp(__a, __b);
+}
+#endif
+
 /* vec_ctf */
 
 static vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
@@ -1623,15 +1933,73 @@
   return __builtin_altivec_vctuxs(__a, __b);
 }
 
-/* vec_div */
+/* vec_double */
+
 #ifdef __VSX__
+static vector double __ATTRS_o_ai vec_double (vector signed long long __a) {
+  vector double __ret = { __a[0], __a[1] };
+  return __ret;
+}
+
+static vector double __ATTRS_o_ai vec_double (vector unsigned long long __a) {
+  vector double __ret = { __a[0], __a[1] };
+  return __ret;
+}
+#endif
+
+/* vec_div */
+
+/* Integer vector divides (vectors are scalarized, elements divided
+   and the vectors reassembled).
+*/
+static vector signed char __ATTRS_o_ai vec_div(vector signed char __a,
+                                               vector signed char __b) {
+  return __a / __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_div(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a / __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_div(vector signed short __a,
+                                                vector signed short __b) {
+  return __a / __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_div(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a / __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_div(vector signed int __a,
+                                              vector signed int __b) {
+  return __a / __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_div(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a / __b;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_div(vector signed long long __a, vector signed long long __b) {
+  return __a / __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_div(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a / __b;
+}
+
 static vector float __ATTRS_o_ai vec_div(vector float __a, vector float __b) {
-  return __builtin_vsx_xvdivsp(__a, __b);
+  return __a / __b;
 }
 
 static vector double __ATTRS_o_ai vec_div(vector double __a,
                                           vector double __b) {
-  return __builtin_vsx_xvdivdp(__a, __b);
+  return __a / __b;
 }
 #endif
 
@@ -1675,6 +2043,92 @@
   __builtin_altivec_dstt(__a, __b, __c);
 }
 
+/* vec_eqv */
+
+#ifdef __POWER8_VECTOR__
+static vector signed char __ATTRS_o_ai vec_eqv(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                  (vector unsigned int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_eqv(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                    (vector unsigned int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_eqv(vector bool char __a,
+                                             vector bool char __b) {
+  return (vector bool char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                (vector unsigned int)__b);
+}
+
+static vector signed short __ATTRS_o_ai vec_eqv(vector signed short __a,
+                                                vector signed short __b) {
+  return (vector signed short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                   (vector unsigned int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_eqv(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                     (vector unsigned int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_eqv(vector bool short __a,
+                                              vector bool short __b) {
+  return (vector bool short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static vector signed int __ATTRS_o_ai vec_eqv(vector signed int __a,
+                                              vector signed int __b) {
+  return (vector signed int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_eqv(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __builtin_vsx_xxleqv(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_eqv(vector bool int __a,
+                                            vector bool int __b) {
+  return (vector bool int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_eqv(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)
+    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_eqv(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)
+    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_eqv(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)
+    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_eqv(vector float __a, vector float __b) {
+  return (vector float)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                            (vector unsigned int)__b);
+}
+
+static vector double __ATTRS_o_ai vec_eqv(vector double __a,
+                                          vector double __b) {
+  return (vector double)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                             (vector unsigned int)__b);
+}
+#endif
+
 /* vec_expte */
 
 static vector float __attribute__((__always_inline__))
@@ -1691,11 +2145,20 @@
 
 /* vec_floor */
 
-static vector float __attribute__((__always_inline__))
-vec_floor(vector float __a) {
+static vector float __ATTRS_o_ai vec_floor(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspim(__a);
+#else
   return __builtin_altivec_vrfim(__a);
+#endif
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_floor(vector double __a) {
+  return __builtin_vsx_xvrdpim(__a);
+}
+#endif
+
 /* vec_vrfim */
 
 static vector float __attribute__((__always_inline__))
@@ -2365,12 +2828,55 @@
 #endif
 
 /* vec_madd */
+static vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector signed short, vector signed short);
+static vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector unsigned short, vector unsigned short);
+static vector signed short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector signed short, vector signed short);
+static vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector unsigned short, vector unsigned short);
 
-static vector float __attribute__((__always_inline__))
-vec_madd(vector float __a, vector float __b, vector float __c) {
-  return __builtin_altivec_vmaddfp(__a, __b, __c);
+static vector signed short __ATTRS_o_ai
+vec_madd(vector signed short __a, vector signed short __b,
+         vector signed short __c) {
+  return  vec_mladd(__a, __b, __c);
 }
 
+static vector signed short __ATTRS_o_ai
+vec_madd(vector signed short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector signed short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector signed short __b,
+         vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_madd(vector float __a, vector float __b, vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaddasp(__a, __b, __c);
+#else
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_madd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvmaddadp(__a, __b, __c);
+}
+#endif
+
 /* vec_vmaddfp */
 
 static vector float __attribute__((__always_inline__))
@@ -2393,6 +2899,20 @@
   return __builtin_altivec_vmhaddshs(__a, __b, __c);
 }
 
+/* vec_msub */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai
+vec_msub(vector float __a, vector float __b, vector float __c) {
+  return __builtin_vsx_xvmsubasp(__a, __b, __c);
+}
+
+static vector double __ATTRS_o_ai
+vec_msub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvmsubadp(__a, __b, __c);
+}
+#endif
+
 /* vec_max */
 
 static vector signed char __ATTRS_o_ai vec_max(vector signed char __a,
@@ -2727,6 +3247,96 @@
                                          0x14, 0x15, 0x16, 0x17));
 }
 
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                 (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                        0x04, 0x05, 0x06, 0x07,
+                                        0x10, 0x11, 0x12, 0x13,
+                                        0x14, 0x15, 0x16, 0x17));
+}
+
+static vector double __ATTRS_o_ai vec_mergeh(vector double __a,
+                                             vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static vector double __ATTRS_o_ai vec_mergeh(vector double __a,
+                                             vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static vector double __ATTRS_o_ai vec_mergeh(vector bool long long __a,
+                                             vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+#endif
+
 /* vec_vmrghb */
 
 #define __builtin_altivec_vmrghb vec_vmrghb
@@ -2915,6 +3525,89 @@
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector signed long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector bool long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector double __ATTRS_o_ai
+vec_mergel(vector double __a, vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector double __ATTRS_o_ai
+vec_mergel(vector double __a, vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector double __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+#endif
+
 /* vec_vmrglb */
 
 #define __builtin_altivec_vmrglb vec_vmrglb
@@ -3014,6 +3707,56 @@
                                          0x1C, 0x1D, 0x1E, 0x1F));
 }
 
+
+#ifdef __POWER8_VECTOR__
+/* vec_mergee */
+
+static vector bool int __ATTRS_o_ai
+vec_mergee(vector bool int __a, vector bool int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+}
+
+static vector signed int __ATTRS_o_ai
+vec_mergee(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergee(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+}
+
+/* vec_mergeo */
+
+static vector bool int  __ATTRS_o_ai
+vec_mergeo(vector bool int __a, vector bool int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector signed int  __ATTRS_o_ai
+vec_mergeo(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int  __ATTRS_o_ai
+vec_mergeo(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#endif
+
 /* vec_mfvscr */
 
 static vector unsigned short __attribute__((__always_inline__))
@@ -3461,6 +4204,65 @@
   __builtin_altivec_mtvscr((vector int)__a);
 }
 
+/* vec_mul */
+
+/* Integer vector multiplication will involve multiplication of the odd/even
+   elements separately, then truncating the results and moving to the
+   result vector.
+*/
+static vector signed char __ATTRS_o_ai vec_mul(vector signed char __a,
+                                               vector signed char __b) {
+  return __a * __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_mul(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a * __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_mul(vector signed short __a,
+                                                vector signed short __b) {
+  return __a * __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_mul(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a * __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_mul(vector signed int __a,
+                                              vector signed int __b) {
+  return __a * __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mul(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_mul(vector signed long long __a, vector signed long long __b) {
+  return __a * __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mul(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a * __b;
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_mul(vector float __a, vector float __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_mul(vector double __a, vector double __b) {
+  return __a * __b;
+}
+#endif
+
 /* The vmulos* and vmules* instructions have a big endian bias, so
    we must reverse the meaning of "even" and "odd" for little endian.  */
 
@@ -3666,13 +4468,181 @@
 #endif
 }
 
+/*  vec_nand */
+
+#ifdef __POWER8_VECTOR__
+static vector signed char __ATTRS_o_ai vec_nand(vector signed char __a,
+                                                vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_nand(vector signed char __a,
+                                                vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                                vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nand(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nand(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return ~(__a & __b);
+
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                              vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static vector signed short __ATTRS_o_ai vec_nand(vector signed short __a,
+                                                 vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static vector signed short __ATTRS_o_ai vec_nand(vector signed short __a,
+                                                 vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static vector signed short __ATTRS_o_ai vec_nand(vector bool short __a,
+                                                 vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_nand(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_nand(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return ~(__a & __b);
+
+}
+
+static vector bool short __ATTRS_o_ai vec_nand(vector bool short __a,
+                                               vector bool short __b) {
+  return ~(__a & __b);
+
+}
+
+static vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
+                                               vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
+                                               vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static vector signed int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                               vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nand(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nand(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                             vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+#endif
+
+/* vec_nmadd */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai
+vec_nmadd(vector float __a, vector float __b, vector float __c) {
+  return __builtin_vsx_xvnmaddasp(__a, __b, __c);
+}
+
+static vector double __ATTRS_o_ai
+vec_nmadd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvnmaddadp(__a, __b, __c);
+}
+#endif
+
 /* vec_nmsub */
 
-static vector float __attribute__((__always_inline__))
+static vector float __ATTRS_o_ai
 vec_nmsub(vector float __a, vector float __b, vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvnmsubasp(__a, __b, __c);
+#else
   return __builtin_altivec_vnmsubfp(__a, __b, __c);
+#endif
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_nmsub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvnmsubadp(__a, __b, __c);
+}
+#endif
+
 /* vec_vnmsubfp */
 
 static vector float __attribute__((__always_inline__))
@@ -3733,6 +4703,15 @@
   return (vector float)__res;
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_nor(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      ~((vector unsigned long long)__a | (vector unsigned long long)__b);
+  return (vector double)__res;
+}
+#endif
+
 /* vec_vnor */
 
 static vector signed char __ATTRS_o_ai vec_vnor(vector signed char __a,
@@ -3925,6 +4904,22 @@
 }
 
 #ifdef __VSX__
+static vector double __ATTRS_o_ai vec_or(vector bool long long __a,
+                                         vector double __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static vector double __ATTRS_o_ai vec_or(vector double __a,
+                                         vector bool long long __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static vector double __ATTRS_o_ai vec_or(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a | (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
 static vector signed long long __ATTRS_o_ai
 vec_or(vector signed long long __a, vector signed long long __b) {
   return __a | __b;
@@ -3961,6 +4956,148 @@
 }
 #endif
 
+#ifdef __POWER8_VECTOR__
+static vector signed char __ATTRS_o_ai vec_orc(vector signed char __a,
+                                               vector signed char __b) {
+  return __a | ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_orc(vector signed char __a,
+                                               vector bool char __b) {
+  return __a | ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                               vector signed char __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_orc(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_orc(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                             vector bool char __b) {
+  return __a | ~__b;
+}
+
+static vector signed short __ATTRS_o_ai vec_orc(vector signed short __a,
+                                                vector signed short __b) {
+  return __a | ~__b;
+}
+
+static vector signed short __ATTRS_o_ai vec_orc(vector signed short __a,
+                                                vector bool short __b) {
+  return __a | ~__b;
+}
+
+static vector signed short __ATTRS_o_ai vec_orc(vector bool short __a,
+                                                vector signed short __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_orc(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_orc(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_orc(vector bool short __a,
+                                              vector bool short __b) {
+  return __a | ~__b;
+}
+
+static vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
+                                              vector signed int __b) {
+  return __a | ~__b;
+}
+
+static vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
+                                              vector bool int __b) {
+  return __a | ~__b;
+}
+
+static vector signed int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                              vector signed int __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_orc(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_orc(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                                vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                            vector bool int __b) {
+  return __a | ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_orc(vector signed long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_orc(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+#endif
+
 /* vec_vor */
 
 static vector signed char __ATTRS_o_ai vec_vor(vector signed char __a,
@@ -4215,6 +5352,53 @@
 #endif
 }
 
+#ifdef __VSX__
+static vector signed int __ATTRS_o_ai vec_pack(vector signed long long __a,
+                                               vector signed long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+static vector unsigned int __ATTRS_o_ai
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_pack(vector bool long long __a,
+                                             vector bool long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+#endif
+
 /* vec_vpkuhum */
 
 #define __builtin_altivec_vpkuhum vec_vpkuhum
@@ -4679,17 +5863,18 @@
 #endif
 }
 
-static vector short __ATTRS_o_ai vec_perm(vector short __a, vector short __b,
+static vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                          vector signed short __b,
                                           vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
   __d = vec_xor(__c, __d);
-  return (vector short)__builtin_altivec_vperm_4si((vector int)__b,
-                                                   (vector int)__a, __d);
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__b,
+                                                          (vector int)__a, __d);
 #else
-  return (vector short)__builtin_altivec_vperm_4si((vector int)__a,
-                                                   (vector int)__b, __c);
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__a,
+                                                          (vector int)__b, __c);
 #endif
 }
 
@@ -4737,15 +5922,16 @@
 #endif
 }
 
-static vector int __ATTRS_o_ai vec_perm(vector int __a, vector int __b,
+static vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                        vector signed int __b,
                                         vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
   __d = vec_xor(__c, __d);
-  return (vector int)__builtin_altivec_vperm_4si(__b, __a, __d);
+  return (vector signed int)__builtin_altivec_vperm_4si(__b, __a, __d);
 #else
-  return (vector int)__builtin_altivec_vperm_4si(__a, __b, __c);
+  return (vector signed int)__builtin_altivec_vperm_4si(__a, __b, __c);
 #endif
 }
 
@@ -4794,16 +5980,18 @@
 }
 
 #ifdef __VSX__
-static vector long long __ATTRS_o_ai vec_perm(vector long long __a,
-                                              vector long long __b,
+static vector long long __ATTRS_o_ai vec_perm(vector signed long long __a,
+                                              vector signed long long __b,
                                               vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
   vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
                               255, 255, 255, 255, 255, 255, 255, 255};
   __d = vec_xor(__c, __d);
-  return (vector long long)__builtin_altivec_vperm_4si(__b, __a, __d);
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
 #else
-  return (vector long long)__builtin_altivec_vperm_4si(__a, __b, __c);
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
 #endif
 }
 
@@ -4822,6 +6010,21 @@
 #endif
 }
 
+static vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
 static vector double __ATTRS_o_ai vec_perm(vector double __a, vector double __b,
                                            vector unsigned char __c) {
 #ifdef __LITTLE_ENDIAN__
@@ -4923,11 +6126,21 @@
 
 /* vec_re */
 
-static vector float __attribute__((__always_inline__))
+static vector float __ATTRS_o_ai
 vec_re(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvresp(__a);
+#else
   return __builtin_altivec_vrefp(__a);
+#endif
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_re(vector double __a) {
+  return __builtin_vsx_xvredp(__a);
+}
+#endif
+
 /* vec_vrefp */
 
 static vector float __attribute__((__always_inline__))
@@ -5016,11 +6229,42 @@
 
 /* vec_round */
 
-static vector float __attribute__((__always_inline__))
-vec_round(vector float __a) {
+static vector float __ATTRS_o_ai vec_round(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspi(__a);
+#else
   return __builtin_altivec_vrfin(__a);
+#endif
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_round(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+
+/* vec_rint */
+
+static vector float __ATTRS_o_ai
+vec_rint(vector float __a) {
+  return __builtin_vsx_xvrspic(__a);
+}
+
+static vector double __ATTRS_o_ai
+vec_rint(vector double __a) {
+  return __builtin_vsx_xvrdpic(__a);
+}
+
+/* vec_nearbyint */
+
+static vector float __ATTRS_o_ai vec_nearbyint(vector float __a) {
+  return __builtin_vsx_xvrspi(__a);
+}
+
+static vector double __ATTRS_o_ai vec_nearbyint(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+#endif
+
 /* vec_vrfin */
 
 static vector float __attribute__((__always_inline__))
@@ -5028,13 +6272,35 @@
   return __builtin_altivec_vrfin(__a);
 }
 
+/* vec_sqrt */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai vec_sqrt(vector float __a) {
+  return __builtin_vsx_xvsqrtsp(__a);
+}
+
+static vector double __ATTRS_o_ai vec_sqrt(vector double __a) {
+  return __builtin_vsx_xvsqrtdp(__a);
+}
+#endif
+
 /* vec_rsqrte */
 
-static __vector float __attribute__((__always_inline__))
+static vector float __ATTRS_o_ai
 vec_rsqrte(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrsqrtesp(__a);
+#else
   return __builtin_altivec_vrsqrtefp(__a);
+#endif
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_rsqrte(vector double __a) {
+  return __builtin_vsx_xvrsqrtedp(__a);
+}
+#endif
+
 /* vec_vrsqrtefp */
 
 static __vector float __attribute__((__always_inline__))
@@ -5165,6 +6431,22 @@
   return (vector float)__res;
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_sel(vector double __a, vector double __b,
+                                          vector bool long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                     ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai vec_sel(vector double __a, vector double __b,
+                                          vector unsigned long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                     ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+#endif
+
 /* vec_vsel */
 
 static vector signed char __ATTRS_o_ai vec_vsel(vector signed char __a,
@@ -5377,78 +6659,220 @@
 
 static vector signed char __ATTRS_o_ai vec_sld(vector signed char __a,
                                                vector signed char __b,
-                                               unsigned char __c) {
+                                               unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector unsigned char __ATTRS_o_ai vec_sld(vector unsigned char __a,
                                                  vector unsigned char __b,
-                                                 unsigned char __c) {
+                                                 unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
-static vector short __ATTRS_o_ai vec_sld(vector short __a, vector short __b,
-                                         unsigned char __c) {
+static vector bool char __ATTRS_o_ai vec_sld(vector bool char __a,
+                                             vector bool char __b,
+                                             unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector signed short __ATTRS_o_ai vec_sld(vector signed short __a,
+                                                vector signed short __b,
+                                                unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector unsigned short __ATTRS_o_ai vec_sld(vector unsigned short __a,
                                                   vector unsigned short __b,
-                                                  unsigned char __c) {
+                                                  unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_sld(vector bool short __a,
+                                              vector bool short __b,
+                                              unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector pixel __ATTRS_o_ai vec_sld(vector pixel __a, vector pixel __b,
-                                         unsigned char __c) {
+                                         unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
-static vector int __ATTRS_o_ai vec_sld(vector int __a, vector int __b,
-                                       unsigned char __c) {
+static vector signed int __ATTRS_o_ai vec_sld(vector signed int __a,
+                                              vector signed int __b,
+                                              unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector unsigned int __ATTRS_o_ai vec_sld(vector unsigned int __a,
                                                 vector unsigned int __b,
-                                                unsigned char __c) {
+                                                unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_sld(vector bool int __a,
+                                            vector bool int __b,
+                                            unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector float __ATTRS_o_ai vec_sld(vector float __a, vector float __b,
-                                         unsigned char __c) {
+                                         unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 /* vec_vsldoi */
@@ -5456,77 +6880,157 @@
 static vector signed char __ATTRS_o_ai vec_vsldoi(vector signed char __a,
                                                   vector signed char __b,
                                                   unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector unsigned char __ATTRS_o_ai vec_vsldoi(vector unsigned char __a,
                                                     vector unsigned char __b,
                                                     unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector short __ATTRS_o_ai vec_vsldoi(vector short __a, vector short __b,
                                             unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector unsigned short __ATTRS_o_ai vec_vsldoi(vector unsigned short __a,
                                                      vector unsigned short __b,
                                                      unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector pixel __ATTRS_o_ai vec_vsldoi(vector pixel __a, vector pixel __b,
                                             unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector int __ATTRS_o_ai vec_vsldoi(vector int __a, vector int __b,
                                           unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector unsigned int __ATTRS_o_ai vec_vsldoi(vector unsigned int __a,
                                                    vector unsigned int __b,
                                                    unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 static vector float __ATTRS_o_ai vec_vsldoi(vector float __a, vector float __b,
                                             unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
   return vec_perm(
       __a, __b,
-      (vector unsigned char)(__c, __c + 1, __c + 2, __c + 3, __c + 4, __c + 5,
-                             __c + 6, __c + 7, __c + 8, __c + 9, __c + 10,
-                             __c + 11, __c + 12, __c + 13, __c + 14, __c + 15));
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
 }
 
 /* vec_sll */
@@ -6057,91 +7561,131 @@
 /* vec_splat */
 
 static vector signed char __ATTRS_o_ai vec_splat(vector signed char __a,
-                                                 unsigned char __b) {
-  return vec_perm(__a, __a, (vector unsigned char)(__b));
+                                                 unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
 }
 
 static vector unsigned char __ATTRS_o_ai vec_splat(vector unsigned char __a,
-                                                   unsigned char __b) {
-  return vec_perm(__a, __a, (vector unsigned char)(__b));
+                                                   unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
 }
 
 static vector bool char __ATTRS_o_ai vec_splat(vector bool char __a,
-                                               unsigned char __b) {
-  return vec_perm(__a, __a, (vector unsigned char)(__b));
+                                               unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
 }
 
-static vector short __ATTRS_o_ai vec_splat(vector short __a,
-                                           unsigned char __b) {
-  __b *= 2;
-  unsigned char b1 = __b + 1;
+static vector signed short __ATTRS_o_ai vec_splat(vector signed short __a,
+                                                  unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
-                                         __b, b1, __b, b1, __b, b1, __b, b1));
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
 }
 
 static vector unsigned short __ATTRS_o_ai vec_splat(vector unsigned short __a,
-                                                    unsigned char __b) {
-  __b *= 2;
-  unsigned char b1 = __b + 1;
+                                                    unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
-                                         __b, b1, __b, b1, __b, b1, __b, b1));
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
 }
 
 static vector bool short __ATTRS_o_ai vec_splat(vector bool short __a,
-                                                unsigned char __b) {
-  __b *= 2;
-  unsigned char b1 = __b + 1;
+                                                unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
-                                         __b, b1, __b, b1, __b, b1, __b, b1));
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
 }
 
 static vector pixel __ATTRS_o_ai vec_splat(vector pixel __a,
-                                           unsigned char __b) {
-  __b *= 2;
-  unsigned char b1 = __b + 1;
+                                           unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
-                                         __b, b1, __b, b1, __b, b1, __b, b1));
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
 }
 
-static vector int __ATTRS_o_ai vec_splat(vector int __a, unsigned char __b) {
-  __b *= 4;
-  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+static vector signed int __ATTRS_o_ai vec_splat(vector signed int __a,
+                                                unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
-                                         b1, b2, b3, __b, b1, b2, b3));
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
 }
 
 static vector unsigned int __ATTRS_o_ai vec_splat(vector unsigned int __a,
-                                                  unsigned char __b) {
-  __b *= 4;
-  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+                                                  unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
-                                         b1, b2, b3, __b, b1, b2, b3));
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
 }
 
 static vector bool int __ATTRS_o_ai vec_splat(vector bool int __a,
-                                              unsigned char __b) {
-  __b *= 4;
-  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+                                              unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
-                                         b1, b2, b3, __b, b1, b2, b3));
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
 }
 
 static vector float __ATTRS_o_ai vec_splat(vector float __a,
-                                           unsigned char __b) {
-  __b *= 4;
-  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+                                           unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
   return vec_perm(__a, __a,
-                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
-                                         b1, b2, b3, __b, b1, b2, b3));
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_splat(vector double __a,
+                                            unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+static vector bool long long __ATTRS_o_ai vec_splat(vector bool long long __a,
+                                                    unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+static vector signed long long __ATTRS_o_ai
+vec_splat(vector signed long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_splat(vector unsigned long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+#endif
+
 /* vec_vspltb */
 
 #define __builtin_altivec_vspltb vec_vspltb
@@ -6313,7 +7857,8 @@
 
 static vector signed char __ATTRS_o_ai vec_sr(vector signed char __a,
                                               vector unsigned char __b) {
-  return __a >> (vector signed char)__b;
+  vector unsigned char __res = (vector unsigned char)__a >> __b;
+  return (vector signed char)__res;
 }
 
 static vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a,
@@ -6321,9 +7866,10 @@
   return __a >> __b;
 }
 
-static vector short __ATTRS_o_ai vec_sr(vector short __a,
+static vector signed short __ATTRS_o_ai vec_sr(vector signed short __a,
                                         vector unsigned short __b) {
-  return __a >> (vector short)__b;
+  vector unsigned short __res = (vector unsigned short)__a >> __b;
+  return (vector signed short)__res;
 }
 
 static vector unsigned short __ATTRS_o_ai vec_sr(vector unsigned short __a,
@@ -6331,8 +7877,10 @@
   return __a >> __b;
 }
 
-static vector int __ATTRS_o_ai vec_sr(vector int __a, vector unsigned int __b) {
-  return __a >> (vector int)__b;
+static vector signed int __ATTRS_o_ai vec_sr(vector signed int __a,
+                                             vector unsigned int __b) {
+  vector unsigned int __res = (vector unsigned int)__a >> __b;
+  return (vector signed int)__res;
 }
 
 static vector unsigned int __ATTRS_o_ai vec_sr(vector unsigned int __a,
@@ -6343,7 +7891,8 @@
 #ifdef __POWER8_VECTOR__
 static vector signed long long __ATTRS_o_ai
 vec_sr(vector signed long long __a, vector unsigned long long __b) {
-  return __a >> (vector long long)__b;
+  vector unsigned long long __res = (vector unsigned long long)__a >> __b;
+  return (vector signed long long)__res;
 }
 
 static vector unsigned long long __ATTRS_o_ai
@@ -7740,6 +9289,23 @@
 }
 #endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
 
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_sub(vector signed long long __a, vector signed long long __b) {
+  return __a - __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sub(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a - __b;
+}
+
+static vector double __ATTRS_o_ai
+vec_sub(vector double __a, vector double __b) {
+  return __a - __b;
+}
+#endif
+
 static vector float __ATTRS_o_ai vec_sub(vector float __a, vector float __b) {
   return __a - __b;
 }
@@ -8235,11 +9801,21 @@
 
 /* vec_trunc */
 
-static vector float __attribute__((__always_inline__))
+static vector float __ATTRS_o_ai
 vec_trunc(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspiz(__a);
+#else
   return __builtin_altivec_vrfiz(__a);
+#endif
 }
 
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_trunc(vector double __a) {
+  return __builtin_vsx_xvrdpiz(__a);
+}
+#endif
+
 /* vec_vrfiz */
 
 static vector float __attribute__((__always_inline__))
@@ -8729,6 +10305,24 @@
                                                   vector bool long long __b) {
   return __a ^ __b;
 }
+
+static vector double __ATTRS_o_ai
+vec_xor(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                          (vector unsigned long long)__b);
+}
+
+static vector double __ATTRS_o_ai
+vec_xor(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long) __b);
+}
+
+static vector double __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
 #endif
 
 /* vec_vxor */
@@ -8904,7 +10498,12 @@
   return __a[__b];
 }
 
-static short __ATTRS_o_ai vec_extract(vector short __a, int __b) {
+static unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
+                                              int __b) {
+  return __a[__b];
+}
+
+static signed short __ATTRS_o_ai vec_extract(vector signed short __a, int __b) {
   return __a[__b];
 }
 
@@ -8913,7 +10512,12 @@
   return __a[__b];
 }
 
-static int __ATTRS_o_ai vec_extract(vector int __a, int __b) {
+static unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
+                                               int __b) {
+  return __a[__b];
+}
+
+static signed int __ATTRS_o_ai vec_extract(vector signed int __a, int __b) {
   return __a[__b];
 }
 
@@ -8921,6 +10525,31 @@
   return __a[__b];
 }
 
+static unsigned int __ATTRS_o_ai vec_extract(vector bool int __a, int __b) {
+  return __a[__b];
+}
+
+#ifdef __VSX__
+static signed long long __ATTRS_o_ai vec_extract(vector signed long long __a,
+                                                 int __b) {
+  return __a[__b];
+}
+
+static unsigned long long __ATTRS_o_ai
+vec_extract(vector unsigned long long __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned long long __ATTRS_o_ai vec_extract(vector bool long long __a,
+                                                   int __b) {
+  return __a[__b];
+}
+
+static double __ATTRS_o_ai vec_extract(vector double __a, int __b) {
+  return __a[__b];
+}
+#endif
+
 static float __ATTRS_o_ai vec_extract(vector float __a, int __b) {
   return __a[__b];
 }
@@ -8941,8 +10570,16 @@
   return __b;
 }
 
-static vector short __ATTRS_o_ai vec_insert(short __a, vector short __b,
-                                            int __c) {
+static vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
+                                                vector bool char __b,
+                                                int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_insert(signed short __a,
+                                                   vector signed short __b,
+                                                   int __c) {
   __b[__c] = __a;
   return __b;
 }
@@ -8954,7 +10591,16 @@
   return __b;
 }
 
-static vector int __ATTRS_o_ai vec_insert(int __a, vector int __b, int __c) {
+static vector bool short __ATTRS_o_ai vec_insert(unsigned short __a,
+                                                 vector bool short __b,
+                                                 int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_insert(signed int __a,
+                                                 vector signed int __b,
+                                                 int __c) {
   __b[__c] = __a;
   return __b;
 }
@@ -8966,6 +10612,38 @@
   return __b;
 }
 
+static vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
+                                               vector bool int __b,
+                                               int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_insert(signed long long __a, vector signed long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector unsigned long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector bool long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+static vector double __ATTRS_o_ai vec_insert(double __a, vector double __b,
+                                             int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+#endif
+
 static vector float __ATTRS_o_ai vec_insert(float __a, vector float __b,
                                             int __c) {
   __b[__c] = __a;
@@ -9890,6 +11568,33 @@
   return (vector unsigned int)(__a);
 }
 
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai vec_splats(signed long long __a) {
+  return (vector signed long long)(__a);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_splats(unsigned long long __a) {
+  return (vector unsigned long long)(__a);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai vec_splats(signed __int128 __a) {
+  return (vector signed __int128)(__a);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_splats(unsigned __int128 __a) {
+  return (vector unsigned __int128)(__a);
+}
+
+#endif
+
+static vector double __ATTRS_o_ai vec_splats(double __a) {
+  return (vector double)(__a);
+}
+#endif
+
 static vector float __ATTRS_o_ai vec_splats(float __a) {
   return (vector float)(__a);
 }
@@ -10060,9 +11765,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_all_eq(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT, __a, __b);
+#else
   return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_eq(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT, __a, __b);
+}
+#endif
+
 /* vec_all_ge */
 
 static int __ATTRS_o_ai vec_all_ge(vector signed char __a,
@@ -10212,9 +11927,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_all_ge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __a, __b);
+#else
   return __builtin_altivec_vcmpgefp_p(__CR6_LT, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_ge(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __a, __b);
+}
+#endif
+
 /* vec_all_gt */
 
 static int __ATTRS_o_ai vec_all_gt(vector signed char __a,
@@ -10364,9 +12089,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_all_gt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __a, __b);
+#else
   return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_gt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __a, __b);
+}
+#endif
+
 /* vec_all_in */
 
 static int __attribute__((__always_inline__))
@@ -10524,9 +12259,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_all_le(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __b, __a);
+#else
   return __builtin_altivec_vcmpgefp_p(__CR6_LT, __b, __a);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_le(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __b, __a);
+}
+#endif
+
 /* vec_all_lt */
 
 static int __ATTRS_o_ai vec_all_lt(vector signed char __a,
@@ -10677,15 +12422,35 @@
 #endif
 
 static int __ATTRS_o_ai vec_all_lt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __b, __a);
+#else
   return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __b, __a);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_lt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __b, __a);
+}
+#endif
+
 /* vec_all_nan */
 
-static int __attribute__((__always_inline__)) vec_all_nan(vector float __a) {
+static int __ATTRS_o_ai vec_all_nan(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __a);
+#else
   return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __a);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_nan(vector double __a) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __a);
+}
+#endif
+
 /* vec_all_ne */
 
 static int __ATTRS_o_ai vec_all_ne(vector signed char __a,
@@ -10851,23 +12616,55 @@
 #endif
 
 static int __ATTRS_o_ai vec_all_ne(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+#else
   return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_ne(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
 /* vec_all_nge */
 
-static int __attribute__((__always_inline__))
+static int __ATTRS_o_ai
 vec_all_nge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ, __a, __b);
+#else
   return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai
+vec_all_nge(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
 /* vec_all_ngt */
 
-static int __attribute__((__always_inline__))
+static int __ATTRS_o_ai
 vec_all_ngt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ, __a, __b);
+#else
   return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai
+vec_all_ngt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
 /* vec_all_nle */
 
 static int __attribute__((__always_inline__))
@@ -11054,9 +12851,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_any_eq(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ_REV, __a, __b);
+#else
   return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_eq(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
 /* vec_any_ge */
 
 static int __ATTRS_o_ai vec_any_ge(vector signed char __a,
@@ -11214,9 +13021,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_any_ge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __a, __b);
+#else
   return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_ge(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
 /* vec_any_gt */
 
 static int __ATTRS_o_ai vec_any_gt(vector signed char __a,
@@ -11374,9 +13191,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_any_gt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __a, __b);
+#else
   return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_gt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
 /* vec_any_le */
 
 static int __ATTRS_o_ai vec_any_le(vector signed char __a,
@@ -11534,9 +13361,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_any_le(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __b, __a);
+#else
   return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __b, __a);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_le(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
 /* vec_any_lt */
 
 static int __ATTRS_o_ai vec_any_lt(vector signed char __a,
@@ -11694,9 +13531,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_any_lt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __b, __a);
+#else
   return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __b, __a);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_lt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
 /* vec_any_nan */
 
 static int __attribute__((__always_inline__)) vec_any_nan(vector float __a) {
@@ -11868,9 +13715,19 @@
 #endif
 
 static int __ATTRS_o_ai vec_any_ne(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT_REV, __a, __b);
+#else
   return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __b);
+#endif
 }
 
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_ne(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT_REV, __a, __b);
+}
+#endif
+
 /* vec_any_nge */
 
 static int __attribute__((__always_inline__))
@@ -11925,11 +13782,14 @@
 The remaining ones (currently controlled by -mcrypto for GCC) still
 need to be provided on compliant hardware even if Vector.Crypto is not
 provided.
-FIXME: the naming convention for the builtins will be adjusted due
-to the inconsistency (__builtin_crypto_ prefix on builtins that cannot be
-removed with -mno-crypto). This is under development.
 */
 #ifdef __CRYPTO__
+#define vec_sbox_be __builtin_altivec_crypto_vsbox
+#define vec_cipher_be __builtin_altivec_crypto_vcipher
+#define vec_cipherlast_be __builtin_altivec_crypto_vcipherlast
+#define vec_ncipher_be __builtin_altivec_crypto_vncipher
+#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
+
 static vector unsigned long long __attribute__((__always_inline__))
 __builtin_crypto_vsbox(vector unsigned long long __a) {
   return __builtin_altivec_crypto_vsbox(__a);
@@ -11961,6 +13821,11 @@
 
 #define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
 #define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
+
+#define vec_shasigma_be(X, Y, Z) \
+  _Generic((X), vector unsigned int: __builtin_crypto_vshasigmaw, \
+                vector unsigned long long: __builtin_crypto_vshasigmad) \
+((X), (Y), (Z))
 #endif
 
 #ifdef __POWER8_VECTOR__
@@ -12008,8 +13873,9 @@
   return __builtin_altivec_crypto_vpmsumw(__a, __b);
 }
 
-static vector unsigned long long __ATTRS_o_ai __builtin_crypto_vpmsumb(
-    vector unsigned long long __a, vector unsigned long long __b) {
+static vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned long long __a,
+                         vector unsigned long long __b) {
   return __builtin_altivec_crypto_vpmsumd(__a, __b);
 }
 
@@ -12018,6 +13884,9 @@
   return __builtin_altivec_vgbbd((vector unsigned char) __a);
 }
 
+#define vec_pmsum_be __builtin_crypto_vpmsumb
+#define vec_gb __builtin_altivec_vgbbd
+
 static vector unsigned char __ATTRS_o_ai vec_vgbbd (vector unsigned char __a)
 {
   return __builtin_altivec_vgbbd(__a);
@@ -12035,6 +13904,14 @@
 {
   return __builtin_altivec_vbpermq(__a, __b);
 }
+
+#ifdef __powerpc64__
+static vector unsigned long long __attribute__((__always_inline__))
+vec_bperm (vector unsigned __int128 __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermq((vector unsigned char) __a,
+                                   (vector unsigned char) __b);
+}
+#endif
 #endif
 
 #undef __ATTRS_o_ai
diff --git a/renderscript/clang-include/ammintrin.h b/renderscript/clang-include/ammintrin.h
index 17f5ab1..4880fd7 100644
--- a/renderscript/clang-include/ammintrin.h
+++ b/renderscript/clang-include/ammintrin.h
@@ -24,24 +24,23 @@
 #ifndef __AMMINTRIN_H
 #define __AMMINTRIN_H
 
-#ifndef __SSE4A__
-#error "SSE4A instruction set not enabled"
-#else
-
 #include <pmmintrin.h>
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
+
 /// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
 ///    integer vector operand at the index idx and of the length len.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code 
+/// \code
 /// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
-/// \endcode 
+/// \endcode
 ///
-/// \code                                                    
+/// \code
 /// This intrinsic corresponds to the \c EXTRQ instruction.
-/// \endcode 
+/// \endcode
 ///
 /// \param x
 ///    The value from which bits are extracted.
@@ -49,10 +48,10 @@
 ///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
 ///    are zero, the length is interpreted as 64.
 /// \param idx
-///    Bits [5:0] specify the index of the least significant bit; the other 
-///    bits are ignored. If the sum of the index and length is greater than 
-///    64, the result is undefined. If the length and index are both zero, 
-///    bits [63:0] of parameter x are extracted. If the length is zero 
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than
+///    64, the result is undefined. If the length and index are both zero,
+///    bits [63:0] of parameter x are extracted. If the length is zero
 ///    but the index is non-zero, the result is undefined.
 /// \returns A 128-bit integer vector whose lower 64 bits contain the bits
 ///    extracted from the source operand.
@@ -65,62 +64,62 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code 
+/// \code
 /// This intrinsic corresponds to the \c EXTRQ instruction.
-/// \endcode 
+/// \endcode
 ///
 /// \param __x
 ///    The value from which bits are extracted.
 /// \param __y
-///    Specifies the index of the least significant bit at [13:8] 
-///    and the length at [5:0]; all other bits are ignored. 
+///    Specifies the index of the least significant bit at [13:8]
+///    and the length at [5:0]; all other bits are ignored.
 ///    If bits [5:0] are zero, the length is interpreted as 64.
-///    If the sum of the index and length is greater than 64, the result is 
-///    undefined. If the length and index are both zero, bits [63:0] of 
-///    parameter __x are extracted. If the length is zero but the index is 
-///    non-zero, the result is undefined. 
-/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted 
+///    If the sum of the index and length is greater than 64, the result is
+///    undefined. If the length and index are both zero, bits [63:0] of
+///    parameter __x are extracted. If the length is zero but the index is
+///    non-zero, the result is undefined.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
 ///    from the source operand.
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_extract_si64(__m128i __x, __m128i __y)
 {
   return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
 }
 
-/// \brief Inserts bits of a specified length from the source integer vector 
-///    y into the lower 64 bits of the destination integer vector x at the 
+/// \brief Inserts bits of a specified length from the source integer vector
+///    y into the lower 64 bits of the destination integer vector x at the
 ///    index idx and of the length len.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code 
+/// \code
 /// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
 /// const int idx);
-/// \endcode 
+/// \endcode
 ///
-/// \code 
+/// \code
 /// This intrinsic corresponds to the \c INSERTQ instruction.
-/// \endcode 
+/// \endcode
 ///
 /// \param x
-///    The destination operand where bits will be inserted. The inserted bits 
-///    are defined by the length len and by the index idx specifying the least 
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length len and by the index idx specifying the least
 ///    significant bit.
 /// \param y
-///    The source operand containing the bits to be extracted. The extracted 
+///    The source operand containing the bits to be extracted. The extracted
 ///    bits are the least significant bits of operand y of length len.
 /// \param len
 ///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
 ///    are zero, the length is interpreted as 64.
 /// \param idx
-///    Bits [5:0] specify the index of the least significant bit; the other 
-///    bits are ignored. If the sum of the index and length is greater than 
-///    64, the result is undefined. If the length and index are both zero, 
-///    bits [63:0] of parameter y are inserted into parameter x. If the 
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than
+///    64, the result is undefined. If the length and index are both zero,
+///    bits [63:0] of parameter y are inserted into parameter x. If the
 ///    length is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits 
+/// \returns A 128-bit integer vector containing the original lower 64-bits
 ///    of destination operand x with the specified bitfields replaced by the
-///    lower bits of source operand y. The upper 64 bits of the return value 
+///    lower bits of source operand y. The upper 64 bits of the return value
 ///    are undefined.
 
 #define _mm_inserti_si64(x, y, len, idx) \
@@ -128,57 +127,57 @@
                                     (__v2di)(__m128i)(y), \
                                     (char)(len), (char)(idx)))
 
-/// \brief Inserts bits of a specified length from the source integer vector 
-///    __y into the lower 64 bits of the destination integer vector __x at 
+/// \brief Inserts bits of a specified length from the source integer vector
+///    __y into the lower 64 bits of the destination integer vector __x at
 ///    the index and of the length specified by __y.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code 
+/// \code
 /// This intrinsic corresponds to the \c INSERTQ instruction.
-/// \endcode 
+/// \endcode
 ///
 /// \param __x
-///    The destination operand where bits will be inserted. The inserted bits 
-///    are defined by the length and by the index of the least significant bit 
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length and by the index of the least significant bit
 ///    specified by operand __y.
 /// \param __y
-///    The source operand containing the bits to be extracted. The extracted 
+///    The source operand containing the bits to be extracted. The extracted
 ///    bits are the least significant bits of operand __y with length specified
-///    by bits [69:64]. These are inserted into the destination at the index 
+///    by bits [69:64]. These are inserted into the destination at the index
 ///    specified by bits [77:72]; all other bits are ignored.
 ///    If bits [69:64] are zero, the length is interpreted as 64.
-///    If the sum of the index and length is greater than 64, the result is 
-///    undefined. If the length and index are both zero, bits [63:0] of 
+///    If the sum of the index and length is greater than 64, the result is
+///    undefined. If the length and index are both zero, bits [63:0] of
 ///    parameter __y are inserted into parameter __x. If the length
-///    is zero but the index is non-zero, the result is undefined. 
-/// \returns A 128-bit integer vector containing the original lower 64-bits 
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits
 ///    of destination operand __x with the specified bitfields replaced by the
-///    lower bits of source operand __y. The upper 64 bits of the return value 
+///    lower bits of source operand __y. The upper 64 bits of the return value
 ///    are undefined.
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_insert_si64(__m128i __x, __m128i __y)
 {
   return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
 }
 
-/// \brief Stores a 64-bit double-precision value in a 64-bit memory location. 
+/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
 ///    used again soon).
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code 
+/// \code
 /// This intrinsic corresponds to the \c MOVNTSD instruction.
-/// \endcode 
+/// \endcode
 ///
 /// \param __p
 ///    The 64-bit memory location used to store the register value.
 /// \param __a
 ///    The 64-bit double-precision floating-point register value to
 ///    be stored.
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_sd(double *__p, __m128d __a)
 {
   __builtin_ia32_movntsd(__p, (__v2df)__a);
@@ -190,21 +189,21 @@
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// \code 
+/// \code
 /// This intrinsic corresponds to the \c MOVNTSS instruction.
-/// \endcode 
+/// \endcode
 ///
 /// \param __p
 ///    The 32-bit memory location used to store the register value.
 /// \param __a
 ///    The 32-bit single-precision floating-point register value to
 ///    be stored.
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_ss(float *__p, __m128 __a)
 {
   __builtin_ia32_movntss(__p, (__v4sf)__a);
 }
 
-#endif /* __SSE4A__ */
+#undef __DEFAULT_FN_ATTRS
 
 #endif /* __AMMINTRIN_H */
diff --git a/renderscript/clang-include/arm_acle.h b/renderscript/clang-include/arm_acle.h
index 73a7e76..4be1d09 100644
--- a/renderscript/clang-include/arm_acle.h
+++ b/renderscript/clang-include/arm_acle.h
@@ -175,14 +175,18 @@
   return __ror(__rev(t), 16);
 }
 
-static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-  __rev16l(unsigned long t) {
-    return __rorl(__revl(t), sizeof(long) / 2);
-}
-
 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
   __rev16ll(uint64_t t) {
-  return __rorll(__revll(t), 32);
+  return (((uint64_t)__rev16(t >> 32)) << 32) | __rev16(t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rev16l(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+    return __rev16(t);
+#else
+    return __rev16ll(t);
+#endif
 }
 
 /* REVSH */
diff --git a/renderscript/clang-include/avx2intrin.h b/renderscript/clang-include/avx2intrin.h
index e1e639d..f786572 100644
--- a/renderscript/clang-include/avx2intrin.h
+++ b/renderscript/clang-include/avx2intrin.h
@@ -28,149 +28,150 @@
 #ifndef __AVX2INTRIN_H
 #define __AVX2INTRIN_H
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx2")))
+
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
 #define _mm256_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw256((X), (Y), (M))
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_abs_epi8(__m256i __a)
 {
     return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_abs_epi16(__m256i __a)
 {
     return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_abs_epi32(__m256i __a)
 {
     return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_packs_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_packs_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_packus_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
 {
   return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_add_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v32qi)__a + (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_add_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hi)__a + (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_add_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8si)__a + (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_add_epi64(__m256i __a, __m256i __b)
 {
   return __a + __b;
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_adds_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_adds_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_adds_epu8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_adds_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
 }
 
-#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
-  __m256i __a = (a); \
-  __m256i __b = (b); \
-  (__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); })
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({        \
+  (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+                                     (__v32qi)(__m256i)(b), (n)); })
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_and_si256(__m256i __a, __m256i __b)
 {
   return __a & __b;
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_andnot_si256(__m256i __a, __m256i __b)
 {
   return ~__a & __b;
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_avg_epu8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_avg_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 {
   return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
                                               (__v32qi)__M);
 }
 
-#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \
-  __m256i __V1 = (V1); \
-  __m256i __V2 = (V2); \
-  (__m256i)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \
-                                   (((M) & 0x01) ? 16 : 0), \
-                                   (((M) & 0x02) ? 17 : 1), \
-                                   (((M) & 0x04) ? 18 : 2), \
-                                   (((M) & 0x08) ? 19 : 3), \
-                                   (((M) & 0x10) ? 20 : 4), \
-                                   (((M) & 0x20) ? 21 : 5), \
-                                   (((M) & 0x40) ? 22 : 6), \
-                                   (((M) & 0x80) ? 23 : 7), \
-                                   (((M) & 0x01) ? 24 : 8), \
-                                   (((M) & 0x02) ? 25 : 9), \
+#define _mm256_blend_epi16(V1, V2, M) __extension__ ({       \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1),   \
+                                   (__v16hi)(__m256i)(V2),   \
+                                   (((M) & 0x01) ? 16 : 0),  \
+                                   (((M) & 0x02) ? 17 : 1),  \
+                                   (((M) & 0x04) ? 18 : 2),  \
+                                   (((M) & 0x08) ? 19 : 3),  \
+                                   (((M) & 0x10) ? 20 : 4),  \
+                                   (((M) & 0x20) ? 21 : 5),  \
+                                   (((M) & 0x40) ? 22 : 6),  \
+                                   (((M) & 0x80) ? 23 : 7),  \
+                                   (((M) & 0x01) ? 24 : 8),  \
+                                   (((M) & 0x02) ? 25 : 9),  \
                                    (((M) & 0x04) ? 26 : 10), \
                                    (((M) & 0x08) ? 27 : 11), \
                                    (((M) & 0x10) ? 28 : 12), \
@@ -178,315 +179,317 @@
                                    (((M) & 0x40) ? 30 : 14), \
                                    (((M) & 0x80) ? 31 : 15)); })
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v32qi)__a == (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hi)__a == (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8si)__a == (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)(__a == __b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
 {
-  return (__m256i)((__v32qi)__a > (__v32qi)__b);
+  /* This function always performs a signed comparison, but __v32qi is a char
+     which may be signed or unsigned, so use __v32qs. */
+  return (__m256i)((__v32qs)__a > (__v32qs)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hi)__a > (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8si)__a > (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)(__a > __b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_hadd_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_hadd_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_hadds_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_hsub_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_hsub_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_madd_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_max_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_max_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_max_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_max_epu8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_max_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_max_epu32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_min_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_min_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_min_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_min_epu8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_min_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_min_epu32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm256_movemask_epi8(__m256i __a)
 {
   return __builtin_ia32_pmovmskb256((__v32qi)__a);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi8_epi16(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi8_epi32(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi8_epi64(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi16_epi32(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi16_epi64(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_epi64(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu8_epi16(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu8_epi32(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu8_epi64(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu16_epi32(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu16_epi64(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__V);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtepu32_epi64(__m128i __V)
 {
   return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__V);
 }
 
-static __inline__  __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m256i __DEFAULT_FN_ATTRS
 _mm256_mul_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mullo_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hi)__a * (__v16hi)__b);
 }
 
-static __inline__  __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m256i __DEFAULT_FN_ATTRS
 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8si)__a * (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mul_epu32(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_or_si256(__m256i __a, __m256i __b)
 {
   return __a | __b;
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sad_epu8(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
 }
 
 #define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
-  __m256i __a = (a); \
-  (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)_mm256_set1_epi32(0), \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
+                                   (__v8si)_mm256_setzero_si256(), \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                    4 + (((imm) & 0x03) >> 0), \
@@ -495,8 +498,8 @@
                                    4 + (((imm) & 0xc0) >> 6)); })
 
 #define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
-  __m256i __a = (a); \
-  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_setzero_si256(), \
                                    0, 1, 2, 3, \
                                    4 + (((imm) & 0x03) >> 0), \
                                    4 + (((imm) & 0x0c) >> 2), \
@@ -509,8 +512,8 @@
                                    12 + (((imm) & 0xc0) >> 6)); })
 
 #define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
-  __m256i __a = (a); \
-  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_setzero_si256(), \
                                    (imm) & 0x3,((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                    4, 5, 6, 7, \
@@ -520,283 +523,279 @@
                                    8 + (((imm) & 0xc0) >> 6), \
                                    12, 13, 14, 15); })
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sign_epi8(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sign_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sign_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
 }
 
 #define _mm256_slli_si256(a, count) __extension__ ({ \
-  __m256i __a = (a); \
-  (__m256i)__builtin_ia32_pslldqi256(__a, (count)*8); })
+  (__m256i)__builtin_ia32_pslldqi256((__m256i)(a), (count)*8); })
 
 #define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_slli_epi16(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sll_epi16(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_slli_epi32(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sll_epi32(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_slli_epi64(__m256i __a, int __count)
 {
   return __builtin_ia32_psllqi256(__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sll_epi64(__m256i __a, __m128i __count)
 {
   return __builtin_ia32_psllq256(__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srai_epi16(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sra_epi16(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srai_epi32(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sra_epi32(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
 }
 
 #define _mm256_srli_si256(a, count) __extension__ ({ \
-  __m256i __a = (a); \
-  (__m256i)__builtin_ia32_psrldqi256(__a, (count)*8); })
+  (__m256i)__builtin_ia32_psrldqi256((__m256i)(a), (count)*8); })
 
 #define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srli_epi16(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srl_epi16(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srli_epi32(__m256i __a, int __count)
 {
   return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srl_epi32(__m256i __a, __m128i __count)
 {
   return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srli_epi64(__m256i __a, int __count)
 {
   return __builtin_ia32_psrlqi256(__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srl_epi64(__m256i __a, __m128i __count)
 {
   return __builtin_ia32_psrlq256(__a, __count);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sub_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v32qi)__a - (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sub_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hi)__a - (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sub_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8si)__a - (__v8si)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sub_epi64(__m256i __a, __m256i __b)
 {
   return __a - __b;
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_subs_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_subs_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_subs_epu8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_subs_epu16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector(__a, __b, 1, 4+1, 3, 4+3);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_shufflevector(__a, __b, 0, 4+0, 2, 4+2);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_xor_si256(__m256i __a, __m256i __b)
 {
   return __a ^ __b;
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
-_mm256_stream_load_si256(__m256i *__V)
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_stream_load_si256(__m256i const *__V)
 {
-  return (__m256i)__builtin_ia32_movntdqa256((__v4di *)__V);
+  return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_broadcastss_ps(__m128 __X)
 {
-  return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X);
+  return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_broadcastsd_pd(__m128d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 0);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_broadcastss_ps(__m128 __X)
 {
-  return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X);
+  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_broadcastsd_pd(__m128d __X)
 {
-  return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X);
+  return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_broadcastsi128_si256(__m128i __X)
 {
   return (__m256i)__builtin_shufflevector(__X, __X, 0, 1, 0, 1);
 }
 
 #define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
-  __m128i __V1 = (V1); \
-  __m128i __V2 = (V2); \
-  (__m128i)__builtin_shufflevector((__v4si)__V1, (__v4si)__V2, \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1),  \
+                                   (__v4si)(__m128i)(V2),  \
                                    (((M) & 0x01) ? 4 : 0), \
                                    (((M) & 0x02) ? 5 : 1), \
                                    (((M) & 0x04) ? 6 : 2), \
                                    (((M) & 0x08) ? 7 : 3)); })
 
 #define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
-  __m256i __V1 = (V1); \
-  __m256i __V2 = (V2); \
-  (__m256i)__builtin_shufflevector((__v8si)__V1, (__v8si)__V2, \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1),   \
+                                   (__v8si)(__m256i)(V2),   \
                                    (((M) & 0x01) ?  8 : 0), \
                                    (((M) & 0x02) ?  9 : 1), \
                                    (((M) & 0x04) ? 10 : 2), \
@@ -806,446 +805,411 @@
                                    (((M) & 0x40) ? 14 : 6), \
                                    (((M) & 0x80) ? 15 : 7)); })
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_broadcastb_epi8(__m128i __X)
 {
-  return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X);
+  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_broadcastw_epi16(__m128i __X)
 {
-  return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X);
+  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_broadcastd_epi32(__m128i __X)
 {
-  return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X);
+  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_broadcastq_epi64(__m128i __X)
 {
-  return (__m256i)__builtin_ia32_pbroadcastq256(__X);
+  return (__m256i)__builtin_shufflevector(__X, __X, 0, 0, 0, 0);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_broadcastb_epi8(__m128i __X)
 {
-  return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X);
+  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_broadcastw_epi16(__m128i __X)
 {
-  return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X);
+  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_broadcastd_epi32(__m128i __X)
 {
-  return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X);
+  return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_broadcastq_epi64(__m128i __X)
 {
-  return (__m128i)__builtin_ia32_pbroadcastq128(__X);
+  return (__m128i)__builtin_shufflevector(__X, __X, 0, 0);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
 }
 
 #define _mm256_permute4x64_pd(V, M) __extension__ ({ \
-  __m256d __V = (V); \
-  (__m256d)__builtin_shufflevector((__v4df)__V, (__v4df) _mm256_setzero_pd(), \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
+                                   (__v4df)_mm256_setzero_pd(), \
                                    (M) & 0x3, ((M) & 0xc) >> 2, \
                                    ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
-_mm256_permutevar8x32_ps(__m256 __a, __m256 __b)
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 {
-  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8sf)__b);
+  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
 }
 
 #define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
-  __m256i __V = (V); \
-  (__m256i)__builtin_shufflevector((__v4di)__V, (__v4di) _mm256_setzero_si256(), \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_setzero_si256(), \
                                    (M) & 0x3, ((M) & 0xc) >> 2, \
                                    ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
 
 #define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
-  __m256i __V1 = (V1); \
-  __m256i __V2 = (V2); \
-  (__m256i)__builtin_ia32_permti256(__V1, __V2, (M)); })
+  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })
 
 #define _mm256_extracti128_si256(V, M) __extension__ ({ \
-  (__m128i)__builtin_shufflevector( \
-    (__v4di)(V), \
-    (__v4di)(_mm256_setzero_si256()), \
-    (((M) & 1) ? 2 : 0), \
-    (((M) & 1) ? 3 : 1) );})
+  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_setzero_si256(), \
+                                   (((M) & 1) ? 2 : 0), \
+                                   (((M) & 1) ? 3 : 1) ); })
 
 #define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
-  (__m256i)__builtin_shufflevector( \
-    (__v4di)(V1), \
-    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
-    (((M) & 1) ? 0 : 4), \
-    (((M) & 1) ? 1 : 5), \
-    (((M) & 1) ? 4 : 2), \
-    (((M) & 1) ? 5 : 3) );})
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
+                                   (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+                                   (((M) & 1) ? 0 : 4), \
+                                   (((M) & 1) ? 1 : 5), \
+                                   (((M) & 1) ? 4 : 2), \
+                                   (((M) & 1) ? 5 : 3) ); })
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskload_epi32(int const *__X, __m256i __M)
 {
   return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskload_epi64(long long const *__X, __m256i __M)
 {
   return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, __M);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskload_epi32(int const *__X, __m128i __M)
 {
   return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskload_epi64(long long const *__X, __m128i __M)
 {
   return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
 {
   __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
 {
   __builtin_ia32_maskstoreq256((__v4di *)__X, __M, __Y);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
 {
   __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
 {
   __builtin_ia32_maskstoreq(( __v2di *)__X, __M, __Y);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sllv_epi32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psllv4di(__X, __Y);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sllv_epi64(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psllv2di(__X, __Y);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srav_epi32(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srav_epi32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srlv_epi32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
 {
   return (__m256i)__builtin_ia32_psrlv4di(__X, __Y);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srlv_epi64(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_psrlv2di(__X, __Y);
 }
 
 #define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
-  __m128d __a = (a); \
-  double const *__m = (m); \
-  __m128i __i = (i); \
-  __m128d __mask = (mask); \
-  (__m128d)__builtin_ia32_gatherd_pd((__v2df)__a, (const __v2df *)__m, \
-             (__v4si)__i, (__v2df)__mask, (s)); })
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
 
 #define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
-  __m256d __a = (a); \
-  double const *__m = (m); \
-  __m128i __i = (i); \
-  __m256d __mask = (mask); \
-  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)__a, (const __v4df *)__m, \
-             (__v4si)__i, (__v4df)__mask, (s)); })
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
 
 #define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
-  __m128d __a = (a); \
-  double const *__m = (m); \
-  __m128i __i = (i); \
-  __m128d __mask = (mask); \
-  (__m128d)__builtin_ia32_gatherq_pd((__v2df)__a, (const __v2df *)__m, \
-             (__v2di)__i, (__v2df)__mask, (s)); })
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
 
 #define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
-  __m256d __a = (a); \
-  double const *__m = (m); \
-  __m256i __i = (i); \
-  __m256d __mask = (mask); \
-  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)__a, (const __v4df *)__m, \
-             (__v4di)__i, (__v4df)__mask, (s)); })
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
 
 #define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
-  __m128 __a = (a); \
-  float const *__m = (m); \
-  __m128i __i = (i); \
-  __m128 __mask = (mask); \
-  (__m128)__builtin_ia32_gatherd_ps((__v4sf)__a, (const __v4sf *)__m, \
-            (__v4si)__i, (__v4sf)__mask, (s)); })
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
 
 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
-  __m256 __a = (a); \
-  float const *__m = (m); \
-  __m256i __i = (i); \
-  __m256 __mask = (mask); \
-  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)__a, (const __v8sf *)__m, \
-            (__v8si)__i, (__v8sf)__mask, (s)); })
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)(__m256)(mask), (s)); })
 
 #define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
-  __m128 __a = (a); \
-  float const *__m = (m); \
-  __m128i __i = (i); \
-  __m128 __mask = (mask); \
-  (__m128)__builtin_ia32_gatherq_ps((__v4sf)__a, (const __v4sf *)__m, \
-            (__v2di)__i, (__v4sf)__mask, (s)); })
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
 
 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
-  __m128 __a = (a); \
-  float const *__m = (m); \
-  __m256i __i = (i); \
-  __m128 __mask = (mask); \
-  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)__a, (const __v4sf *)__m, \
-            (__v4di)__i, (__v4sf)__mask, (s)); })
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)(__m128)(mask), (s)); })
 
 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
-  __m128i __a = (a); \
-  int const *__m = (m); \
-  __m128i __i = (i); \
-  __m128i __mask = (mask); \
-  (__m128i)__builtin_ia32_gatherd_d((__v4si)__a, (const __v4si *)__m, \
-            (__v4si)__i, (__v4si)__mask, (s)); })
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
 
 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
-  __m256i __a = (a); \
-  int const *__m = (m); \
-  __m256i __i = (i); \
-  __m256i __mask = (mask); \
-  (__m256i)__builtin_ia32_gatherd_d256((__v8si)__a, (const __v8si *)__m, \
-            (__v8si)__i, (__v8si)__mask, (s)); })
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
+                                       (int const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8si)(__m256i)(mask), (s)); })
 
 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
-  __m128i __a = (a); \
-  int const *__m = (m); \
-  __m128i __i = (i); \
-  __m128i __mask = (mask); \
-  (__m128i)__builtin_ia32_gatherq_d((__v4si)__a, (const __v4si *)__m, \
-            (__v2di)__i, (__v4si)__mask, (s)); })
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
 
 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
-  __m128i __a = (a); \
-  int const *__m = (m); \
-  __m256i __i = (i); \
-  __m128i __mask = (mask); \
-  (__m128i)__builtin_ia32_gatherq_d256((__v4si)__a, (const __v4si *)__m, \
-            (__v4di)__i, (__v4si)__mask, (s)); })
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
+                                       (int const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4si)(__m128i)(mask), (s)); })
 
 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
-  __m128i __a = (a); \
-  long long const *__m = (m); \
-  __m128i __i = (i); \
-  __m128i __mask = (mask); \
-  (__m128i)__builtin_ia32_gatherd_q((__v2di)__a, (const __v2di *)__m, \
-             (__v4si)__i, (__v2di)__mask, (s)); })
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
 
 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
-  __m256i __a = (a); \
-  long long const *__m = (m); \
-  __m128i __i = (i); \
-  __m256i __mask = (mask); \
-  (__m256i)__builtin_ia32_gatherd_q256((__v4di)__a, (const __v4di *)__m, \
-             (__v4si)__i, (__v4di)__mask, (s)); })
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
 
 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
-  __m128i __a = (a); \
-  long long const *__m = (m); \
-  __m128i __i = (i); \
-  __m128i __mask = (mask); \
-  (__m128i)__builtin_ia32_gatherq_q((__v2di)__a, (const __v2di *)__m, \
-             (__v2di)__i, (__v2di)__mask, (s)); })
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
 
 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
-  __m256i __a = (a); \
-  long long const *__m = (m); \
-  __m256i __i = (i); \
-  __m256i __mask = (mask); \
-  (__m256i)__builtin_ia32_gatherq_q256((__v4di)__a, (const __v4di *)__m, \
-             (__v4di)__i, (__v4di)__mask, (s)); })
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
 
 #define _mm_i32gather_pd(m, i, s) __extension__ ({ \
-  double const *__m = (m); \
-  __m128i __i = (i); \
-  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_setzero_pd(), \
-             (const __v2df *)__m, (__v4si)__i, \
-             (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); })
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
 
 #define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
-  double const *__m = (m); \
-  __m128i __i = (i); \
-  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_setzero_pd(), \
-             (const __v4df *)__m, (__v4si)__i, \
-             (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); })
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
 
 #define _mm_i64gather_pd(m, i, s) __extension__ ({ \
-  double const *__m = (m); \
-  __m128i __i = (i); \
-  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_setzero_pd(), \
-             (const __v2df *)__m, (__v2di)__i, \
-             (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); })
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
 
 #define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
-  double const *__m = (m); \
-  __m256i __i = (i); \
-  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_setzero_pd(), \
-             (const __v4df *)__m, (__v4di)__i, \
-             (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); })
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
 
 #define _mm_i32gather_ps(m, i, s) __extension__ ({ \
-  float const *__m = (m); \
-  __m128i __i = (i); \
-  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_setzero_ps(), \
-             (const __v4sf *)__m, (__v4si)__i, \
-             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
 
 #define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
-  float const *__m = (m); \
-  __m256i __i = (i); \
-  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_setzero_ps(), \
-             (const __v8sf *)__m, (__v8si)__i, \
-             (__v8sf)_mm256_set1_ps((float)(int)-1), (s)); })
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
+                                                             _mm256_setzero_ps(), \
+                                                             _CMP_EQ_OQ), \
+                                       (s)); })
 
 #define _mm_i64gather_ps(m, i, s) __extension__ ({ \
-  float const *__m = (m); \
-  __m128i __i = (i); \
-  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_setzero_ps(), \
-             (const __v4sf *)__m, (__v2di)__i, \
-             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
 
 #define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
-  float const *__m = (m); \
-  __m256i __i = (i); \
-  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_setzero_ps(), \
-             (const __v4sf *)__m, (__v4di)__i, \
-             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                            _mm_setzero_ps()), \
+                                       (s)); })
 
 #define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
-  int const *__m = (m); \
-  __m128i __i = (i); \
-  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_setzero_si128(), \
-            (const __v4si *)__m, (__v4si)__i, \
-            (__v4si)_mm_set1_epi32(-1), (s)); })
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v4si)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
 
 #define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
-  int const *__m = (m); \
-  __m256i __i = (i); \
-  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_setzero_si256(), \
-            (const __v8si *)__m, (__v8si)__i, \
-            (__v8si)_mm256_set1_epi32(-1), (s)); })
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
+                                       (int const *)(m), (__v8si)(__m256i)(i), \
+                                       (__v8si)_mm256_set1_epi32(-1), (s)); })
 
 #define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
-  int const *__m = (m); \
-  __m128i __i = (i); \
-  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_setzero_si128(), \
-            (const __v4si *)__m, (__v2di)__i, \
-            (__v4si)_mm_set1_epi32(-1), (s)); })
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v2di)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
 
 #define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
-  int const *__m = (m); \
-  __m256i __i = (i); \
-  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_setzero_si128(), \
-            (const __v4si *)__m, (__v4di)__i, \
-            (__v4si)_mm_set1_epi32(-1), (s)); })
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
+                                       (int const *)(m), (__v4di)(__m256i)(i), \
+                                       (__v4si)_mm_set1_epi32(-1), (s)); })
 
 #define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
-  long long const *__m = (m); \
-  __m128i __i = (i); \
-  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_setzero_si128(), \
-             (const __v2di *)__m, (__v4si)__i, \
-             (__v2di)_mm_set1_epi64x(-1), (s)); })
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
 
 #define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
-  long long const *__m = (m); \
-  __m128i __i = (i); \
-  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_setzero_si256(), \
-             (const __v4di *)__m, (__v4si)__i, \
-             (__v4di)_mm256_set1_epi64x(-1), (s)); })
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
 
 #define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
-  long long const *__m = (m); \
-  __m128i __i = (i); \
-  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_setzero_si128(), \
-             (const __v2di *)__m, (__v2di)__i, \
-             (__v2di)_mm_set1_epi64x(-1), (s)); })
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
 
 #define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
-  long long const *__m = (m); \
-  __m256i __i = (i); \
-  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_setzero_si256(), \
-             (const __v4di *)__m, (__v4di)__i, \
-             (__v4di)_mm256_set1_epi64x(-1), (s)); })
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#undef __DEFAULT_FN_ATTRS
 
 #endif /* __AVX2INTRIN_H */
diff --git a/renderscript/clang-include/avx512bwintrin.h b/renderscript/clang-include/avx512bwintrin.h
index d0591e4..f289ed7 100644
--- a/renderscript/clang-include/avx512bwintrin.h
+++ b/renderscript/clang-include/avx512bwintrin.h
@@ -33,8 +33,11 @@
 typedef char __v64qi __attribute__ ((__vector_size__ (64)));
 typedef short __v32hi __attribute__ ((__vector_size__ (64)));
 
-static  __inline __v64qi __attribute__ ((__always_inline__, __nodebug__))
-_mm512_setzero_qi (void) {
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
+
+static  __inline __v64qi __DEFAULT_FN_ATTRS
+_mm512_setzero_qi(void) {
   return (__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0,
@@ -45,8 +48,8 @@
                        0, 0, 0, 0, 0, 0, 0, 0 };
 }
 
-static  __inline __v32hi __attribute__ ((__always_inline__, __nodebug__))
-_mm512_setzero_hi (void) {
+static  __inline __v32hi __DEFAULT_FN_ATTRS
+_mm512_setzero_hi(void) {
   return (__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0,
@@ -55,300 +58,300 @@
 
 /* Integer compare */
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
                                                    (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
                                                  (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
                                                    (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
                                                  __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
                                                 (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
                                                  (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
                                                 (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
                                                  __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
                                                    (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
                                                  (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
                                                    (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
                                                  __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
                                                 (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
                                                  (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
                                                 (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
                                                  __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
                                                 (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
                                                  (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
                                                 (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
                                                  __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
                                                 (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
                                                  (__mmask64)-1);
 }
 
-static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
                                                 (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
                                                  __u);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_add_epi8 (__m512i __A, __m512i __B) {
   return (__m512i) ((__v64qi) __A + (__v64qi) __B);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
              (__v64qi) __B,
@@ -356,21 +359,20 @@
              (__mmask64) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
              (__v64qi) __B,
-             (__v64qi)
-             _mm512_setzero_qi (),
+             (__v64qi) _mm512_setzero_qi(),
              (__mmask64) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sub_epi8 (__m512i __A, __m512i __B) {
   return (__m512i) ((__v64qi) __A - (__v64qi) __B);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
              (__v64qi) __B,
@@ -378,21 +380,20 @@
              (__mmask64) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
              (__v64qi) __B,
-             (__v64qi)
-             _mm512_setzero_qi (),
+             (__v64qi) _mm512_setzero_qi(),
              (__mmask64) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_add_epi16 (__m512i __A, __m512i __B) {
   return (__m512i) ((__v32hi) __A + (__v32hi) __B);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
              (__v32hi) __B,
@@ -400,21 +401,20 @@
              (__mmask32) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
              (__v32hi) __B,
-             (__v32hi)
-             _mm512_setzero_hi (),
+             (__v32hi) _mm512_setzero_hi(),
              (__mmask32) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sub_epi16 (__m512i __A, __m512i __B) {
   return (__m512i) ((__v32hi) __A - (__v32hi) __B);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
              (__v32hi) __B,
@@ -422,21 +422,20 @@
              (__mmask32) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
              (__v32hi) __B,
-             (__v32hi)
-             _mm512_setzero_hi (),
+             (__v32hi) _mm512_setzero_hi(),
              (__mmask32) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mullo_epi16 (__m512i __A, __m512i __B) {
   return (__m512i) ((__v32hi) __A * (__v32hi) __B);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
               (__v32hi) __B,
@@ -444,15 +443,1059 @@
               (__mmask32) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
               (__v32hi) __B,
-              (__v32hi)
-              _mm512_setzero_hi (),
+              (__v32hi) _mm512_setzero_hi(),
               (__mmask32) __U);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi8 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi16 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shuffle_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I,
+         __mmask32 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+              (__v32hi) __I /* idx */ ,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I
+              /* idx */ ,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhrs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) __W,
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X,
+         __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A,
+      __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)_mm256_setzero_si256(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)__O,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi) _mm256_setzero_si256(),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) __O,
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) __O,
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+         __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) __W,
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+          __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+         __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) __W,
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+          __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
 #define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
                                          (__v64qi)(__m512i)(b), \
@@ -493,4 +1536,7 @@
                                           (__v32hi)(__m512i)(b), \
                                           (p), (__mmask32)(m)); })
 
+
+#undef __DEFAULT_FN_ATTRS
+
 #endif
diff --git a/renderscript/clang-include/avx512cdintrin.h b/renderscript/clang-include/avx512cdintrin.h
new file mode 100644
index 0000000..3894b29
--- /dev/null
+++ b/renderscript/clang-include/avx512cdintrin.h
@@ -0,0 +1,131 @@
+/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512CDINTRIN_H
+#define __AVX512CDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+               (__v8di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/renderscript/clang-include/avx512dqintrin.h b/renderscript/clang-include/avx512dqintrin.h
index fd33be2..afee490 100644
--- a/renderscript/clang-include/avx512dqintrin.h
+++ b/renderscript/clang-include/avx512dqintrin.h
@@ -28,12 +28,15 @@
 #ifndef __AVX512DQINTRIN_H
 #define __AVX512DQINTRIN_H
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mullo_epi64 (__m512i __A, __m512i __B) {
   return (__m512i) ((__v8di) __A * (__v8di) __B);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
               (__v8di) __B,
@@ -41,7 +44,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) {
   return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
               (__v8di) __B,
@@ -50,12 +53,12 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_xor_pd (__m512d __A, __m512d __B) {
   return (__m512d) ((__v8di) __A ^ (__v8di) __B);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
              (__v8df) __B,
@@ -63,7 +66,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
              (__v8df) __B,
@@ -72,12 +75,12 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_xor_ps (__m512 __A, __m512 __B) {
   return (__m512) ((__v16si) __A ^ (__v16si) __B);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
             (__v16sf) __B,
@@ -85,7 +88,7 @@
             (__mmask16) __U);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
             (__v16sf) __B,
@@ -94,12 +97,12 @@
             (__mmask16) __U);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_or_pd (__m512d __A, __m512d __B) {
   return (__m512d) ((__v8di) __A | (__v8di) __B);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
             (__v8df) __B,
@@ -107,7 +110,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
             (__v8df) __B,
@@ -116,12 +119,12 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_or_ps (__m512 __A, __m512 __B) {
   return (__m512) ((__v16si) __A | (__v16si) __B);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
                  (__v16sf) __B,
@@ -129,7 +132,7 @@
                  (__mmask16) __U);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
                  (__v16sf) __B,
@@ -138,12 +141,12 @@
                  (__mmask16) __U);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_and_pd (__m512d __A, __m512d __B) {
   return (__m512d) ((__v8di) __A & (__v8di) __B);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
              (__v8df) __B,
@@ -151,7 +154,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
              (__v8df) __B,
@@ -160,12 +163,12 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_and_ps (__m512 __A, __m512 __B) {
   return (__m512) ((__v16si) __A & (__v16si) __B);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
             (__v16sf) __B,
@@ -173,7 +176,7 @@
             (__mmask16) __U);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
             (__v16sf) __B,
@@ -182,7 +185,7 @@
             (__mmask16) __U);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_andnot_pd (__m512d __A, __m512d __B) {
   return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
               (__v8df) __B,
@@ -191,7 +194,7 @@
               (__mmask8) -1);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
               (__v8df) __B,
@@ -199,7 +202,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
               (__v8df) __B,
@@ -208,7 +211,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_andnot_ps (__m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
              (__v16sf) __B,
@@ -217,7 +220,7 @@
              (__mmask16) -1);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
              (__v16sf) __B,
@@ -225,7 +228,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
              (__v16sf) __B,
@@ -234,4 +237,542 @@
              (__mmask16) __U);
 }
 
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epi64(__A, __R) __extension__ ({              \
+  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,               \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundpd_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,                 \
+                (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundpd_epi64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,        \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epu64(__A, __R) __extension__ ({               \
+  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,               \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundpd_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,                \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundpd_epu64(__U, __A, __R) __extension__ ({     \
+  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,                \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epi64(__A, __R) __extension__ ({             \
+  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,              \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundps_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,                 \
+                (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundps_epi64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,               \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epu64(__A, __R) __extension__ ({              \
+  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundps_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,                \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundps_epu64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_pd(__A, __R) __extension__ ({          \
+  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,           \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepi64_pd(__W, __U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,                 \
+                (__v8df) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundepi64_pd(__U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,             \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) __W,
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_ps(__A, __R) __extension__ ({        \
+  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,          \
+               (__v8sf) _mm256_setzero_ps(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepi64_ps(__W, __U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,                  \
+               (__v8sf) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundepi64_ps(__U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,              \
+               (__v8sf) _mm256_setzero_ps(), (__mmask8) __U, __R);})
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi64(__A, __R) __extension__ ({             \
+  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundpd_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,                 \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundpd_epi64(__U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,             \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epu64(__A, __R) __extension__ ({              \
+  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,              \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundpd_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,                \
+                  (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundpd_epu64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,              \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epi64(__A, __R) __extension__ ({            \
+  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,             \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundps_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,                 \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundps_epi64(__U, __A, __R) __extension__ ({  \
+  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epu64(__A, __R) __extension__ ({            \
+  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,            \
+                  (__v8di) _mm512_setzero_si512(),(__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundps_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,                \
+                  (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundps_epu64(__U, __A, __R) __extension__ ({  \
+  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,             \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_pd(__A, __R) __extension__ ({          \
+  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,          \
+                 (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepu64_pd(__W, __U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,                \
+                 (__v8df) __W, (__mmask8) __U, __R);})
+
+
+#define _mm512_maskz_cvt_roundepu64_pd(__U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,            \
+                 (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_ps(__A, __R) __extension__ ({         \
+  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,          \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepu64_ps(__W, __U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,                 \
+                (__v8sf) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundepu64_ps(__U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,             \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U, __R);})
+
+#define _mm512_range_pd(__A, __B, __C) __extension__ ({                     \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,\
+               (__v8df) _mm512_setzero_pd(), (__mmask8) -1,                 \
+               _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({      \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,\
+               (__v8df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_range_pd(__U, __A, __B, __C) __extension__ ({           \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C, \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) __U,                 \
+               _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_range_round_pd(__A, __B, __C, __R) __extension__ ({           \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C, \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_range_round_pd(__W, __U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,      \
+               (__v8df) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_range_round_pd(__U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,   \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_range_ps(__A, __B, __C) __extension__ ({                       \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B, __C, \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1,                 \
+               _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({         \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,       \
+               __C, (__v16sf) __W, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_range_ps(__U, __A, __B, __C) __extension__ ({      \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,(__v16sf) __B, \
+              __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) __U,      \
+              _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_range_round_ps(__A, __B, __C, __R) __extension__ ({         \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,   \
+                __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R);})
+
+#define _mm512_mask_range_round_ps(__W, __U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,          \
+                __C, (__v16sf) __W, (__mmask16) __U, __R);})
+
+#define _mm512_maskz_range_round_ps(__U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,      \
+                __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, __R);})
+
+#define _mm512_reduce_pd(__A, __B) __extension__ ({             \
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,    \
+                (__v8df) __W,(__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_reduce_pd(__U, __A, __B) __extension__ ({  \
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_reduce_ps(__A, __B) __extension__ ({              \
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,  \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({   \
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) __W, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_reduce_ps(__U, __A, __B) __extension__ ({       \
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_reduce_round_pd(__A, __B, __R) __extension__ ({\
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_reduce_round_pd(__W, __U, __A, __B, __R) __extension__ ({\
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,    \
+                (__v8df) __W,(__mmask8) __U, __R);})
+
+#define _mm512_maskz_reduce_round_pd(__U, __A, __B, __R) __extension__ ({\
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_reduce_round_ps(__A, __B, __R) __extension__ ({\
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,  \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R);})
+
+#define _mm512_mask_reduce_round_ps(__W, __U, __A, __B, __R) __extension__ ({\
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) __W, (__mmask16) __U, __R);})
+
+#define _mm512_maskz_reduce_round_ps(__U, __A, __B, __R) __extension__ ({\
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, __R);})
+
+#undef __DEFAULT_FN_ATTRS
+
 #endif
diff --git a/renderscript/clang-include/avx512erintrin.h b/renderscript/clang-include/avx512erintrin.h
index 57c61aa..40a9121 100644
--- a/renderscript/clang-include/avx512erintrin.h
+++ b/renderscript/clang-include/avx512erintrin.h
@@ -1,4 +1,4 @@
-/*===---- avx512fintrin.h - AVX2 intrinsics -----------------------------------===
+/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,6 @@
 #ifndef __AVX512ERINTRIN_H
 #define __AVX512ERINTRIN_H
 
-
 // exp2a23
 #define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
   (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
@@ -127,19 +126,19 @@
   _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
+  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)_mm_setzero_ps(), \
                                         (__mmask8)-1, (R)); })
 
 #define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
+  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(S), \
                                         (__mmask8)(M), (R)); })
 
 #define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
+  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)_mm_setzero_ps(), \
                                         (__mmask8)(M), (R)); })
@@ -154,19 +153,19 @@
   _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
+  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)_mm_setzero_pd(), \
                                          (__mmask8)-1, (R)); })
 
 #define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
+  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)(__m128d)(S), \
                                          (__mmask8)(M), (R)); })
 
 #define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
+  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)_mm_setzero_pd(), \
                                          (__mmask8)(M), (R)); })
@@ -230,19 +229,19 @@
   _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
+  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
                                       (__v4sf)(__m128)(B), \
                                       (__v4sf)_mm_setzero_ps(), \
                                       (__mmask8)-1, (R)); })
 
 #define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
+  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
                                       (__v4sf)(__m128)(B), \
                                       (__v4sf)(__m128)(S), \
                                       (__mmask8)(M), (R)); })
 
 #define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
-  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
+  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
                                       (__v4sf)(__m128)(B), \
                                       (__v4sf)_mm_setzero_ps(), \
                                       (__mmask8)(M), (R)); })
@@ -257,19 +256,19 @@
   _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
 
 #define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
+  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
                                        (__v2df)(__m128d)(B), \
                                        (__v2df)_mm_setzero_pd(), \
                                        (__mmask8)-1, (R)); })
 
 #define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
+  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
                                        (__v2df)(__m128d)(B), \
                                        (__v2df)(__m128d)(S), \
                                        (__mmask8)(M), (R)); })
 
 #define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
-  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
+  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
                                        (__v2df)(__m128d)(B), \
                                        (__v2df)_mm_setzero_pd(), \
                                        (__mmask8)(M), (R)); })
diff --git a/renderscript/clang-include/avx512fintrin.h b/renderscript/clang-include/avx512fintrin.h
index d299704..8dcdc71 100644
--- a/renderscript/clang-include/avx512fintrin.h
+++ b/renderscript/clang-include/avx512fintrin.h
@@ -1,4 +1,4 @@
-/*===---- avx512fintrin.h - AVX2 intrinsics --------------------------------===
+/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -46,15 +46,42 @@
 #define _MM_FROUND_TO_ZERO          0x03
 #define _MM_FROUND_CUR_DIRECTION    0x04
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+
 /* Create vectors with repeated elements */
 
-static  __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static  __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_setzero_si512(void)
 {
   return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_undefined_pd()
+{
+  return (__m512d)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined()
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined_ps()
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_undefined_epi32()
+{
+  return (__m512i)__builtin_ia32_undef512();
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_set1_epi32(__mmask16 __M, int __A)
 {
   return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
@@ -63,7 +90,7 @@
                  __M);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
 {
 #ifdef __x86_64__
@@ -79,45 +106,45 @@
 #endif
 }
 
-static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_setzero_ps(void)
 {
   return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
 }
-static  __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+static  __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_setzero_pd(void)
 {
   return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
 }
 
-static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_set1_ps(float __w)
 {
   return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                    __w, __w, __w, __w, __w, __w, __w, __w  };
 }
 
-static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_set1_pd(double __w)
 {
   return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
-static __inline __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_set1_epi32(int __s)
 {
   return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
                              __s, __s, __s, __s, __s, __s, __s, __s };
 }
 
-static __inline __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_set1_epi64(long long __d)
 {
   return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
 }
 
-static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_broadcastss_ps(__m128 __X)
 {
   float __f = __X[0];
@@ -127,7 +154,7 @@
                     __f, __f, __f, __f };
 }
 
-static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_broadcastsd_pd(__m128d __X)
 {
   double __d = __X[0];
@@ -137,39 +164,39 @@
 
 /* Cast between vector types */
 
-static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_castpd256_pd512(__m256d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
 }
 
-static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_castps256_ps512(__m256 __a)
 {
   return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
                                           -1, -1, -1, -1, -1, -1, -1, -1);
 }
 
-static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline __m128d __DEFAULT_FN_ATTRS
 _mm512_castpd512_pd128(__m512d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1);
 }
 
-static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline __m128 __DEFAULT_FN_ATTRS
 _mm512_castps512_ps128(__m512 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
 }
 
 /* Bitwise operators */
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_epi32(__m512i __a, __m512i __b)
 {
   return __a & __b;
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
@@ -177,7 +204,7 @@
               (__v16si) __src,
               (__mmask16) __k);
 }
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
@@ -187,13 +214,13 @@
               (__mmask16) __k);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_epi64(__m512i __a, __m512i __b)
 {
   return __a & __b;
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
@@ -201,7 +228,7 @@
               (__v8di) __src,
               (__mmask8) __k);
 }
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
@@ -211,7 +238,7 @@
               (__mmask8) __k);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_andnot_epi32 (__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
@@ -221,7 +248,7 @@
               (__mmask16) -1);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
@@ -230,7 +257,7 @@
               (__mmask16) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
@@ -240,7 +267,7 @@
               (__mmask16) __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_andnot_epi64 (__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
@@ -250,7 +277,7 @@
               (__mmask8) -1);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
@@ -258,7 +285,7 @@
               (__v8di) __W, __U);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
@@ -267,13 +294,13 @@
               _mm512_setzero_pd (),
               __U);
 }
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_or_epi32(__m512i __a, __m512i __b)
 {
   return __a | __b;
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
@@ -281,7 +308,7 @@
               (__v16si) __src,
               (__mmask16) __k);
 }
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
@@ -291,13 +318,13 @@
               (__mmask16) __k);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_or_epi64(__m512i __a, __m512i __b)
 {
   return __a | __b;
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
@@ -305,7 +332,7 @@
               (__v8di) __src,
               (__mmask8) __k);
 }
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
@@ -315,13 +342,13 @@
               (__mmask8) __k);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_xor_epi32(__m512i __a, __m512i __b)
 {
   return __a ^ __b;
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
@@ -329,7 +356,7 @@
               (__v16si) __src,
               (__mmask16) __k);
 }
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
@@ -339,13 +366,13 @@
               (__mmask16) __k);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_xor_epi64(__m512i __a, __m512i __b)
 {
   return __a ^ __b;
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
@@ -353,7 +380,7 @@
               (__v8di) __src,
               (__mmask8) __k);
 }
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
 {
   return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
@@ -363,68 +390,68 @@
               (__mmask8) __k);
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_si512(__m512i __a, __m512i __b)
 {
   return __a & __b;
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_or_si512(__m512i __a, __m512i __b)
 {
   return __a | __b;
 }
 
-static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_xor_si512(__m512i __a, __m512i __b)
 {
   return __a ^ __b;
 }
 /* Arithmetic */
 
-static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_add_pd(__m512d __a, __m512d __b)
 {
   return __a + __b;
 }
 
-static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_add_ps(__m512 __a, __m512 __b)
 {
   return __a + __b;
 }
 
-static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_mul_pd(__m512d __a, __m512d __b)
 {
   return __a * __b;
 }
 
-static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_mul_ps(__m512 __a, __m512 __b)
 {
   return __a * __b;
 }
 
-static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_sub_pd(__m512d __a, __m512d __b)
 {
   return __a - __b;
 }
 
-static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_sub_ps(__m512 __a, __m512 __b)
 {
   return __a - __b;
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_add_epi64 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v8di) __A + (__v8di) __B);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
@@ -433,7 +460,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
@@ -443,13 +470,13 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sub_epi64 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v8di) __A - (__v8di) __B);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
@@ -458,7 +485,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
@@ -468,13 +495,13 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_add_epi32 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v16si) __A + (__v16si) __B);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
@@ -483,7 +510,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
@@ -493,13 +520,13 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sub_epi32 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v16si) __A - (__v16si) __B);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
@@ -508,7 +535,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
@@ -518,7 +545,7 @@
              (__mmask16) __U);
 }
 
-static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_max_pd(__m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
@@ -529,7 +556,7 @@
              _MM_FROUND_CUR_DIRECTION);
 }
 
-static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_max_ps(__m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
@@ -540,8 +567,68 @@
             _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_max_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_max_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_max_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_max_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
 static __inline __m512i
-__attribute__ ((__always_inline__, __nodebug__))
+__DEFAULT_FN_ATTRS
 _mm512_max_epi32(__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
@@ -551,7 +638,7 @@
               (__mmask16) -1);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epu32(__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
@@ -561,7 +648,7 @@
               (__mmask16) -1);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epi64(__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
@@ -571,7 +658,7 @@
               (__mmask8) -1);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_max_epu64(__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
@@ -581,7 +668,7 @@
               (__mmask8) -1);
 }
 
-static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_min_pd(__m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
@@ -592,7 +679,7 @@
              _MM_FROUND_CUR_DIRECTION);
 }
 
-static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_min_ps(__m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
@@ -603,8 +690,68 @@
             _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_min_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_min_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_min_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_min_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
 static __inline __m512i
-__attribute__ ((__always_inline__, __nodebug__))
+__DEFAULT_FN_ATTRS
 _mm512_min_epi32(__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
@@ -614,7 +761,7 @@
               (__mmask16) -1);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epu32(__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
@@ -624,7 +771,7 @@
               (__mmask16) -1);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epi64(__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
@@ -634,7 +781,7 @@
               (__mmask8) -1);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_min_epu64(__m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
@@ -644,7 +791,7 @@
               (__mmask8) -1);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mul_epi32(__m512i __X, __m512i __Y)
 {
   return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
@@ -654,7 +801,7 @@
               (__mmask8) -1);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
   return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
@@ -662,7 +809,7 @@
               (__v8di) __W, __M);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
 {
   return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
@@ -672,7 +819,7 @@
               __M);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mul_epu32(__m512i __X, __m512i __Y)
 {
   return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
@@ -682,7 +829,7 @@
                (__mmask8) -1);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
   return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
@@ -690,7 +837,7 @@
                (__v8di) __W, __M);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
 {
   return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
@@ -700,13 +847,13 @@
                __M);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mullo_epi32 (__m512i __A, __m512i __B)
 {
   return (__m512i) ((__v16si) __A * (__v16si) __B);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
@@ -716,7 +863,7 @@
               __M);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
@@ -724,25 +871,25 @@
               (__v16si) __W, __M);
 }
 
-static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
-_mm512_sqrt_pd(__m512d a)
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_sqrt_pd(__m512d __a)
 {
-  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)a,
+  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__a,
                                                 (__v8df) _mm512_setzero_pd (),
                                                 (__mmask8) -1,
                                                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
-_mm512_sqrt_ps(__m512 a)
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_sqrt_ps(__m512 __a)
 {
-  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)a,
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__a,
                                                (__v16sf) _mm512_setzero_ps (),
                                                (__mmask16) -1,
                                                _MM_FROUND_CUR_DIRECTION);
 }
 
-static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_rsqrt14_pd(__m512d __A)
 {
   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
@@ -750,7 +897,7 @@
                  _mm512_setzero_pd (),
                  (__mmask8) -1);}
 
-static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_rsqrt14_ps(__m512 __A)
 {
   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
@@ -759,27 +906,27 @@
                 (__mmask16) -1);
 }
 
-static  __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rsqrt14_ss(__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+  return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __A,
              (__v4sf) __B,
              (__v4sf)
              _mm_setzero_ps (),
              (__mmask8) -1);
 }
 
-static  __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_rsqrt14_sd(__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
+  return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __A,
               (__v2df) __B,
               (__v2df)
               _mm_setzero_pd (),
               (__mmask8) -1);
 }
 
-static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_rcp14_pd(__m512d __A)
 {
   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
@@ -788,7 +935,7 @@
                (__mmask8) -1);
 }
 
-static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_rcp14_ps(__m512 __A)
 {
   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
@@ -796,27 +943,27 @@
               _mm512_setzero_ps (),
               (__mmask16) -1);
 }
-static  __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp14_ss(__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+  return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __A,
                  (__v4sf) __B,
                  (__v4sf)
                  _mm_setzero_ps (),
                  (__mmask8) -1);
 }
 
-static  __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_rcp14_sd(__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
+  return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __A,
             (__v2df) __B,
             (__v2df)
             _mm_setzero_pd (),
             (__mmask8) -1);
 }
 
-static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_floor_ps(__m512 __A)
 {
   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
@@ -825,7 +972,7 @@
                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_floor_pd(__m512d __A)
 {
   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
@@ -834,7 +981,7 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_ceil_ps(__m512 __A)
 {
   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
@@ -843,7 +990,7 @@
                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_ceil_pd(__m512d __A)
 {
   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
@@ -852,7 +999,7 @@
                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512i __attribute__ (( __always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_abs_epi64(__m512i __A)
 {
   return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
@@ -861,7 +1008,7 @@
              (__mmask8) -1);
 }
 
-static __inline __m512i __attribute__ (( __always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_abs_epi32(__m512i __A)
 {
   return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
@@ -870,6 +1017,489 @@
              (__mmask16) -1);
 }
 
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_add_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_add_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_add_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_add_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_add_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_add_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_add_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, (__v8df) __B, \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_add_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_addpd512_mask((__v8df) __A, (__v8df) __B, \
+                (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_add_round_pd(__U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, (__v8df) __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R); })
+
+#define _mm512_add_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
+                (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R); })
+
+#define _mm512_mask_add_round_ps(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
+                (__v16sf) __W, (__mmask16)__U, __R); })
+
+#define _mm512_maskz_add_round_ps(__U, __A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
+                (__v16sf) _mm512_setzero_ps(), (__mmask16)__U, __R); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_sub_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_sub_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_sub_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_sub_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_sub_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_sub_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_sub_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B,\
+             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_sub_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_sub_round_pd(__U, __A, __B, __R) __extension__ ({ \
+   (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_sub_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+
+#define _mm512_mask_sub_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) __W, (__mmask16) __U, __R); });
+
+#define _mm512_maskz_sub_round_ps(__U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R);});
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_mul_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_mul_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_mul_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mul_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_mul_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_mul_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mul_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B,\
+             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_mul_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_mul_round_pd(__U, __A, __B, __R) __extension__ ({ \
+   (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_mul_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+
+#define _mm512_mask_mul_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) __W, (__mmask16) __U, __R); });
+
+#define _mm512_maskz_mul_round_ps(__U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R);});
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_div_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_div_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_div_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_div_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_div_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B,\
+             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_div_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_div_round_pd(__U, __A, __B, __R) __extension__ ({ \
+   (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_div_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+
+#define _mm512_mask_div_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) __W, (__mmask16) __U, __R); });
+
+#define _mm512_maskz_div_round_ps(__U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R);});
+
 #define _mm512_roundscale_ps(A, B) __extension__ ({ \
   (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(A), (B), (__v16sf)(A), \
                                          -1, _MM_FROUND_CUR_DIRECTION); })
@@ -878,75 +1508,779 @@
   (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(A), (B), (__v8df)(A), \
                                           -1, _MM_FROUND_CUR_DIRECTION); })
 
-static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+#define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), (__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), (__v8df) (C), \
+                                             (__mmask8) (U), (R)); })
+
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), -(__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), -(__v8df) (C), \
+                                             (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) (A), \
+                                              (__v8df) (B), -(__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) (A), \
+                                             (__v8df) (B), (__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) (A), \
+                                             (__v8df) (B), -(__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) (A), \
+                                              (__v8df) (B), -(__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d)
-    __builtin_ia32_vfmaddpd512_mask(__A,
-                                    __B,
-                                    __C,
-                                    (__mmask8) -1,
-                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d)
-    __builtin_ia32_vfmsubpd512_mask(__A,
-                                    __B,
-                                    __C,
-                                    (__mmask8) -1,
-                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d)
-    __builtin_ia32_vfnmaddpd512_mask(__A,
-                                     __B,
-                                     __C,
-                                     (__mmask8) -1,
-                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), (__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), (__v16sf) (C), \
+                                            (__mmask16) (U), (R)); })
+
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), -(__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), -(__v16sf) (C), \
+                                            (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) (A), \
+                                             (__v16sf) (B), -(__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) (A), \
+                                            (__v16sf) (B), (__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) (A), \
+                                            (__v16sf) (B), -(__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) (A), \
+                                             (__v16sf) (B), -(__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512)
-    __builtin_ia32_vfmaddps512_mask(__A,
-                                    __B,
-                                    __C,
-                                    (__mmask16) -1,
-                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512)
-    __builtin_ia32_vfmsubps512_mask(__A,
-                                    __B,
-                                    __C,
-                                    (__mmask16) -1,
-                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512)
-    __builtin_ia32_vfnmaddps512_mask(__A,
-                                     __B,
-                                     __C,
-                                     (__mmask16) -1,
-                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), (__v8df) (C), \
+                                                (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), (__v8df) (C), \
+                                                (__mmask8) (U), (R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) (A), \
+                                                 (__v8df) (B), (__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) (A), \
+                                                 (__v8df) (B), (__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), -(__v8df) (C), \
+                                                (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), -(__v8df) (C), \
+                                                (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) (A), \
+                                                 (__v8df) (B), -(__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        -(__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), (__v16sf) (C), \
+                                               (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), (__v16sf) (C), \
+                                               (__mmask16) (U), (R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) (A), \
+                                                (__v16sf) (B), (__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) (A), \
+                                                (__v16sf) (B), (__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), -(__v16sf) (C), \
+                                               (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), -(__v16sf) (C), \
+                                               (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) (A), \
+                                                (__v16sf) (B), -(__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       -(__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) (A), \
+                                                 (__v8df) (B), (__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) (A), \
+                                                (__v16sf) (B), (__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) (A), \
+                                               (__v8df) (B), (__v8df) (C), \
+                                               (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) (A), \
+                                              (__v16sf) (B), (__v16sf) (C), \
+                                              (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16sf) __C,
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+
+
 /* Vector permutations */
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
 {
   return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
@@ -955,7 +2289,7 @@
                                                        (__v16si) __B,
                                                        (__mmask16) -1);
 }
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
 {
   return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
@@ -965,7 +2299,7 @@
                                                        (__mmask8) -1);
 }
 
-static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
 {
   return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
@@ -974,7 +2308,7 @@
                                                         (__v8df) __B,
                                                         (__mmask8) -1);
 }
-static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
 {
   return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
@@ -999,24 +2333,22 @@
 /* Vector Extract */
 
 #define _mm512_extractf64x4_pd(A, I) __extension__ ({                    \
-      __m512d __A = (A);                                                 \
       (__m256d)                                                          \
-        __builtin_ia32_extractf64x4_mask((__v8df)__A,                    \
+        __builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A),           \
                                          (I),                            \
                                          (__v4df)_mm256_setzero_si256(), \
                                          (__mmask8) -1); })
 
 #define _mm512_extractf32x4_ps(A, I) __extension__ ({                    \
-      __m512 __A = (A);                                                  \
       (__m128)                                                           \
-        __builtin_ia32_extractf32x4_mask((__v16sf)__A,                   \
+        __builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A),           \
                                          (I),                            \
                                          (__v4sf)_mm_setzero_ps(),       \
                                          (__mmask8) -1); })
 
 /* Vector Blend */
 
-static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
 {
   return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
@@ -1024,7 +2356,7 @@
                  (__mmask8) __U);
 }
 
-static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
 {
   return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
@@ -1032,7 +2364,7 @@
                 (__mmask16) __U);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
 {
   return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
@@ -1040,7 +2372,7 @@
                 (__mmask8) __U);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
 {
   return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
@@ -1084,7 +2416,7 @@
 
 /* Conversion */
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttps_epu32(__m512 __A)
 {
   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
@@ -1104,7 +2436,7 @@
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)-1, (R)); })
 
-static __inline __m512d __attribute__ (( __always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_cvtepi32_pd(__m256i __A)
 {
   return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
@@ -1113,7 +2445,7 @@
                 (__mmask8) -1);
 }
 
-static __inline __m512d __attribute__ (( __always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_cvtepu32_pd(__m256i __A)
 {
   return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
@@ -1132,7 +2464,7 @@
                                             (__v16hi)_mm256_setzero_si256(), \
                                             -1); })
 
-static  __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+static  __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_cvtph_ps(__m256i __A)
 {
   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
@@ -1142,19 +2474,19 @@
                 _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m512i __attribute__((__always_inline__, __nodebug__))
-_mm512_cvttps_epi32(__m512 a)
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi32(__m512 __a)
 {
   return (__m512i)
-    __builtin_ia32_cvttps2dq512_mask((__v16sf) a,
+    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
                                      (__v16si) _mm512_setzero_si512 (),
                                      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
-_mm512_cvttpd_epi32(__m512d a)
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi32(__m512d __a)
 {
-  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) a,
+  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
                                                    (__v8si)_mm256_setzero_si256(),
                                                    (__mmask8) -1,
                                                     _MM_FROUND_CUR_DIRECTION);
@@ -1191,19 +2523,19 @@
                                             (__mmask8) -1, (R)); })
 
 /* Unpack and Interleave */
-static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_unpackhi_pd(__m512d __a, __m512d __b)
 {
   return __builtin_shufflevector(__a, __b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
 }
 
-static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_unpacklo_pd(__m512d __a, __m512d __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
 }
 
-static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_unpackhi_ps(__m512 __a, __m512 __b)
 {
   return __builtin_shufflevector(__a, __b,
@@ -1213,7 +2545,7 @@
                                  2+12, 18+12, 3+12, 19+12);
 }
 
-static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_unpacklo_ps(__m512 __a, __m512 __b)
 {
   return __builtin_shufflevector(__a, __b,
@@ -1225,7 +2557,7 @@
 
 /* Bit Test */
 
-static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __mmask16 __DEFAULT_FN_ATTRS
 _mm512_test_epi32_mask(__m512i __A, __m512i __B)
 {
   return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
@@ -1233,7 +2565,7 @@
             (__mmask16) -1);
 }
 
-static __inline __mmask8 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __mmask8 __DEFAULT_FN_ATTRS
 _mm512_test_epi64_mask(__m512i __A, __m512i __B)
 {
   return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
@@ -1243,7 +2575,7 @@
 
 /* SIMD load ops */
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *)__P,
@@ -1252,7 +2584,7 @@
                                                      (__mmask16) __U);
 }
 
-static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
 {
   return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *)__P,
@@ -1261,7 +2593,7 @@
                                                      (__mmask8) __U);
 }
 
-static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
 {
   return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *)__P,
@@ -1270,7 +2602,7 @@
                                                   (__mmask16) __U);
 }
 
-static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
 {
   return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *)__P,
@@ -1279,7 +2611,7 @@
                                                    (__mmask8) __U);
 }
 
-static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
 {
   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
@@ -1288,7 +2620,7 @@
                                                   (__mmask16) __U);
 }
 
-static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_load_pd(__mmask8 __U, void const *__P)
 {
   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
@@ -1297,7 +2629,7 @@
                                                    (__mmask8) __U);
 }
 
-static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_loadu_pd(double const *__p)
 {
   struct __loadu_pd {
@@ -1306,7 +2638,7 @@
   return ((struct __loadu_pd*)__p)->__v;
 }
 
-static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_loadu_ps(float const *__p)
 {
   struct __loadu_ps {
@@ -1315,7 +2647,7 @@
   return ((struct __loadu_ps*)__p)->__v;
 }
 
-static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+static __inline __m512 __DEFAULT_FN_ATTRS
 _mm512_load_ps(double const *__p)
 {
   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
@@ -1324,7 +2656,7 @@
                                                   (__mmask16) -1);
 }
 
-static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+static __inline __m512d __DEFAULT_FN_ATTRS
 _mm512_load_pd(float const *__p)
 {
   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
@@ -1335,65 +2667,65 @@
 
 /* SIMD store ops */
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
 {
   __builtin_ia32_storedqudi512_mask ((__v8di *)__P, (__v8di) __A,
                                      (__mmask8) __U);
 }
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
 {
   __builtin_ia32_storedqusi512_mask ((__v16si *)__P, (__v16si) __A,
                                      (__mmask16) __U);
 }
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
 {
   __builtin_ia32_storeupd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
 }
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_storeu_pd(void *__P, __m512d __A)
 {
   __builtin_ia32_storeupd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)-1);
 }
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
 {
   __builtin_ia32_storeups512_mask ((__v16sf *)__P, (__v16sf) __A,
                                    (__mmask16) __U);
 }
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_storeu_ps(void *__P, __m512 __A)
 {
   __builtin_ia32_storeups512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)-1);
 }
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
 {
   __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
 }
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_store_pd(void *__P, __m512d __A)
 {
   *(__m512d*)__P = __A;
 }
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
 {
   __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
                                    (__mmask16) __U);
 }
 
-static __inline void __attribute__ ((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm512_store_ps(void *__P, __m512 __A)
 {
   *(__m512*)__P = __A;
@@ -1401,7 +2733,7 @@
 
 /* Mask ops */
 
-static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
+static __inline __mmask16 __DEFAULT_FN_ATTRS
 _mm512_knot(__mmask16 __M)
 {
   return __builtin_ia32_knothi(__M);
@@ -1409,339 +2741,334 @@
 
 /* Integer compare */
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
                                                    (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
                                                    __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
                                                    (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
                                                    __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
                                                 __u);
 }
 
 #define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
-  __m512i __a = (a); \
-  __m512i __b = (b); \
-  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (p), \
                                          (__mmask16)-1); })
 
 #define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
-  __m512i __a = (a); \
-  __m512i __b = (b); \
-  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (p), \
                                           (__mmask16)-1); })
 
 #define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
-  __m512i __a = (a); \
-  __m512i __b = (b); \
-  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (p), \
                                         (__mmask8)-1); })
 
 #define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
-  __m512i __a = (a); \
-  __m512i __b = (b); \
-  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (p), \
                                          (__mmask8)-1); })
 
 #define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
-  __m512i __a = (a); \
-  __m512i __b = (b); \
-  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (p), \
                                          (__mmask16)(m)); })
 
 #define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
-  __m512i __a = (a); \
-  __m512i __b = (b); \
-  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (p), \
                                           (__mmask16)(m)); })
 
 #define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
-  __m512i __a = (a); \
-  __m512i __b = (b); \
-  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (p), \
                                         (__mmask8)(m)); })
 
 #define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
-  __m512i __a = (a); \
-  __m512i __b = (b); \
-  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (p), \
                                          (__mmask8)(m)); })
+
+#undef __DEFAULT_FN_ATTRS
+
 #endif // __AVX512FINTRIN_H
diff --git a/renderscript/clang-include/avx512vlbwintrin.h b/renderscript/clang-include/avx512vlbwintrin.h
index c3b087e..b4542d6 100644
--- a/renderscript/clang-include/avx512vlbwintrin.h
+++ b/renderscript/clang-include/avx512vlbwintrin.h
@@ -1,4 +1,4 @@
-/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ----------===
+/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -28,585 +28,588 @@
 #ifndef __AVX512VLBWINTRIN_H
 #define __AVX512VLBWINTRIN_H
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw")))
+
 /* Integer compare */
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
                                                    (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
                                                    (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
                                                    (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
                                                  __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
                                                 (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
                                                  __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
                                                    (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
                                                    (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
                                                    (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
                                                    __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
                                                  __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
                                                 (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
                                                  __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
                                                 (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
                                                  __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
                                                  __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
                                                 (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
                                                  (__mmask32)-1);
 }
 
-static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
                                                  __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
                                                 (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
                                                  (__mmask16)-1);
 }
 
-static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
                                                  __u);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
   return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
              (__v32qi) __B,
@@ -614,7 +617,7 @@
              (__mmask32) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
              (__v32qi) __B,
@@ -623,7 +626,7 @@
              (__mmask32) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
              (__v16hi) __B,
@@ -631,7 +634,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
              (__v16hi) __B,
@@ -640,7 +643,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
              (__v32qi) __B,
@@ -648,7 +651,7 @@
              (__mmask32) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
              (__v32qi) __B,
@@ -657,7 +660,7 @@
              (__mmask32) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
              (__v16hi) __B,
@@ -665,7 +668,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
              (__v16hi) __B,
@@ -673,7 +676,7 @@
              _mm256_setzero_si256 (),
              (__mmask16) __U);
 }
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
              (__v16qi) __B,
@@ -681,7 +684,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
              (__v16qi) __B,
@@ -690,7 +693,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
              (__v8hi) __B,
@@ -698,7 +701,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
              (__v8hi) __B,
@@ -707,7 +710,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
              (__v16qi) __B,
@@ -715,7 +718,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
              (__v16qi) __B,
@@ -724,7 +727,7 @@
              (__mmask16) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
              (__v8hi) __B,
@@ -732,7 +735,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
              (__v8hi) __B,
@@ -741,7 +744,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
               (__v16hi) __B,
@@ -749,7 +752,7 @@
               (__mmask16) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
               (__v16hi) __B,
@@ -758,7 +761,7 @@
               (__mmask16) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
               (__v8hi) __B,
@@ -766,7 +769,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
               (__v8hi) __B,
@@ -774,6 +777,1480 @@
               _mm_setzero_si128 (),
               (__mmask8) __U);
 }
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) _mm_setzero_si128 (), __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) __W,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) __W,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) __W,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) __W,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) __W,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+               (__v8hi) __I /* idx */ ,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I,
+         __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+               (__v16hi) __I /* idx */ ,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+               (__v16qi) __Y,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+               (__v16qi) __Y,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
+         __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+               (__v32qi) __Y,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+               (__v32qi) __Y,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v4si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v4si) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v8si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v8si) _mm256_setzero_si256(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi8 (__m128i __A) {
+
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+               (__v8hi) __Y,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+               (__v8hi) __Y,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+               (__v16hi) __Y,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+               (__v16hi) __Y,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+          __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+       __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+          __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+       __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+         __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+          __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+         __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+          __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
 #define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \
   (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
                                          (__v16qi)(__m128i)(b), \
@@ -854,4 +2331,6 @@
                                           (__v16hi)(__m256i)(b), \
                                           (p), (__mmask16)(m)); })
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __AVX512VLBWINTRIN_H */
diff --git a/renderscript/clang-include/avx512vldqintrin.h b/renderscript/clang-include/avx512vldqintrin.h
index 4024446..dfd858e 100644
--- a/renderscript/clang-include/avx512vldqintrin.h
+++ b/renderscript/clang-include/avx512vldqintrin.h
@@ -1,4 +1,4 @@
-/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ---------------------------===
+/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -20,7 +20,7 @@
  *
  *===-----------------------------------------------------------------------===
  */
- 
+
 #ifndef __IMMINTRIN_H
 #error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
 #endif
@@ -28,13 +28,15 @@
 #ifndef __AVX512VLDQINTRIN_H
 #define __AVX512VLDQINTRIN_H
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq")))
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mullo_epi64 (__m256i __A, __m256i __B) {
   return (__m256i) ((__v4di) __A * (__v4di) __B);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
               (__v4di) __B,
@@ -42,7 +44,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
               (__v4di) __B,
@@ -51,12 +53,12 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi64 (__m128i __A, __m128i __B) {
   return (__m128i) ((__v2di) __A * (__v2di) __B);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
               (__v2di) __B,
@@ -64,7 +66,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
               (__v2di) __B,
@@ -73,7 +75,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
               (__v4df) __B,
@@ -81,7 +83,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
               (__v4df) __B,
@@ -90,7 +92,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
               (__v2df) __B,
@@ -98,7 +100,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
               (__v2df) __B,
@@ -107,7 +109,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
              (__v8sf) __B,
@@ -115,7 +117,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
              (__v8sf) __B,
@@ -124,7 +126,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
              (__v4sf) __B,
@@ -132,7 +134,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
              (__v4sf) __B,
@@ -141,7 +143,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
              (__v4df) __B,
@@ -149,7 +151,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
              (__v4df) __B,
@@ -158,7 +160,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
              (__v2df) __B,
@@ -166,7 +168,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
              (__v2df) __B,
@@ -175,7 +177,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
             (__v8sf) __B,
@@ -183,7 +185,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
             (__v8sf) __B,
@@ -192,7 +194,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
             (__v4sf) __B,
@@ -200,7 +202,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
             (__v4sf) __B,
@@ -209,7 +211,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
         __m256d __B) {
   return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
@@ -218,7 +220,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
              (__v4df) __B,
@@ -227,7 +229,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
              (__v2df) __B,
@@ -235,7 +237,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
              (__v2df) __B,
@@ -244,7 +246,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
             (__v8sf) __B,
@@ -252,7 +254,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
             (__v8sf) __B,
@@ -261,7 +263,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
             (__v4sf) __B,
@@ -269,7 +271,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
             (__v4sf) __B,
@@ -278,7 +280,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
             (__v4df) __B,
@@ -286,7 +288,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
             (__v4df) __B,
@@ -295,7 +297,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
             (__v2df) __B,
@@ -303,7 +305,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
             (__v2df) __B,
@@ -312,7 +314,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
                  (__v8sf) __B,
@@ -320,7 +322,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
                  (__v8sf) __B,
@@ -329,7 +331,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
                  (__v4sf) __B,
@@ -337,7 +339,7 @@
                  (__mmask8) __U);
 }
 
-static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
                  (__v4sf) __B,
@@ -346,4 +348,606 @@
                  (__mmask8) __U);
 }
 
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+#define _mm_range_pd(__A, __B, __C) __extension__ ({                         \
+  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1); })
+
+#define _mm_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({          \
+  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
+                (__v2df) __W, (__mmask8) __U); })
+
+#define _mm_maskz_range_pd(__U, __A, __B, __C) __extension__ ({              \
+  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) __U); })
+
+#define _mm256_range_pd(__A, __B, __C) __extension__ ({                      \
+  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) -1); })
+
+#define _mm256_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({       \
+  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
+                (__v4df) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_range_pd(__U, __A, __B, __C) __extension__ ({           \
+  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+
+#define _mm_range_ps(__A, __B, __C) __extension__ ({                         \
+  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+
+#define _mm_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({          \
+  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
+                (__v4sf) __W, (__mmask8) __U); })
+
+#define _mm_maskz_range_ps(__U, __A, __B, __C) __extension__ ({              \
+  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+
+#define _mm256_range_ps(__A, __B, __C) __extension__ ({                      \
+  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+
+#define _mm256_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({       \
+  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
+                (__v8sf) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_range_ps(__U, __A, __B, __C) __extension__ ({           \
+  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+
+#define _mm_reduce_pd(__A, __B) __extension__ ({                \
+  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1); })
+
+#define _mm_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
+  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
+                (__v2df) __W, (__mmask8) __U); })
+
+#define _mm_maskz_reduce_pd(__U, __A, __B) __extension__ ({     \
+  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) __U); })
+
+#define _mm256_reduce_pd(__A, __B) __extension__ ({                \
+  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) -1); })
+
+#define _mm256_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
+  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
+                (__v4df) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_reduce_pd(__U, __A, __B) __extension__ ({     \
+  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+
+#define _mm_reduce_ps(__A, __B) __extension__ ({                   \
+  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+
+#define _mm_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({    \
+  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
+                (__v4sf) __W, (__mmask8) __U); })
+
+#define _mm_maskz_reduce_ps(__U, __A, __B) __extension__ ({        \
+  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+
+#define _mm256_reduce_ps(__A, __B) __extension__ ({                \
+  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+
+#define _mm256_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({ \
+  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
+                (__v8sf) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_reduce_ps(__U, __A, __B) __extension__ ({     \
+  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+
+#undef __DEFAULT_FN_ATTRS
+
 #endif
diff --git a/renderscript/clang-include/avx512vlintrin.h b/renderscript/clang-include/avx512vlintrin.h
index 9de0cf4..8f13536 100644
--- a/renderscript/clang-include/avx512vlintrin.h
+++ b/renderscript/clang-include/avx512vlintrin.h
@@ -28,589 +28,589 @@
 #ifndef __AVX512VLINTRIN_H
 #define __AVX512VLINTRIN_H
 
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl")))
+#define __DEFAULT_FN_ATTRS_BOTH __attribute__((__always_inline__, __nodebug__, __target__("avx512vl, avx512bw")))
+
 /* Integer compare */
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
                                                 __u);
 }
 
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
                                                 __u);
 }
 
-
-
-
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
                                                   (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
 _mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
                                                   __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
                                                 __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
                                                (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
                                                __u);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
                                                 (__mmask8)-1);
 }
 
-static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
 _mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
                                                 __u);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B)
 {
@@ -620,7 +620,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
@@ -630,7 +630,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B)
 {
@@ -640,7 +640,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
@@ -650,7 +650,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B)
 {
@@ -660,7 +660,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
@@ -670,7 +670,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B)
 {
@@ -680,7 +680,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
@@ -690,7 +690,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
@@ -700,7 +700,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
@@ -710,7 +710,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
@@ -720,7 +720,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
@@ -730,7 +730,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
@@ -740,7 +740,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
@@ -750,7 +750,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
@@ -760,7 +760,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
@@ -770,7 +770,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
            __m256i __Y)
 {
@@ -779,7 +779,7 @@
               (__v4di) __W, __M);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
 {
   return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
@@ -789,7 +789,7 @@
               __M);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
         __m128i __Y)
 {
@@ -798,7 +798,7 @@
               (__v2di) __W, __M);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
 {
   return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
@@ -808,7 +808,7 @@
               __M);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
            __m256i __Y)
 {
@@ -817,7 +817,7 @@
                (__v4di) __W, __M);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
 {
   return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
@@ -827,7 +827,7 @@
                __M);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
         __m128i __Y)
 {
@@ -836,7 +836,7 @@
                (__v2di) __W, __M);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
 {
   return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
@@ -846,7 +846,7 @@
                __M);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
@@ -856,7 +856,7 @@
               __M);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
        __m256i __B)
 {
@@ -865,7 +865,7 @@
               (__v8si) __W, __M);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
@@ -875,7 +875,7 @@
               __M);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_mullo_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
           __m128i __B)
 {
@@ -884,7 +884,7 @@
               (__v4si) __W, __M);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B)
 {
@@ -894,7 +894,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
@@ -904,7 +904,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
@@ -913,7 +913,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
@@ -923,7 +923,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
         __m256i __B)
 {
@@ -933,7 +933,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
@@ -943,7 +943,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
            __m128i __B)
 {
@@ -953,7 +953,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
@@ -963,7 +963,7 @@
               (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
           __m256i __B)
 {
@@ -973,7 +973,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
@@ -983,7 +983,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
@@ -992,7 +992,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
@@ -1002,7 +1002,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B)
 {
@@ -1012,7 +1012,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
@@ -1022,7 +1022,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
@@ -1032,7 +1032,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
@@ -1042,7 +1042,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B)
 {
@@ -1051,7 +1051,7 @@
              (__v4di) __W, __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
@@ -1061,7 +1061,7 @@
              __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
@@ -1070,7 +1070,7 @@
              (__v2di) __W, __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
@@ -1080,7 +1080,7 @@
              __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
         __m256i __B)
 {
@@ -1089,7 +1089,7 @@
               (__v4di) __W, __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
@@ -1099,7 +1099,7 @@
               __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
            __m128i __B)
 {
@@ -1108,7 +1108,7 @@
               (__v2di) __W, __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
@@ -1118,7 +1118,7 @@
               __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
           __m256i __B)
 {
@@ -1128,7 +1128,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
@@ -1138,7 +1138,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
@@ -1147,7 +1147,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
@@ -1157,7 +1157,7 @@
             (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B)
 {
@@ -1167,7 +1167,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
@@ -1177,7 +1177,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B)
 {
@@ -1187,7 +1187,7 @@
              (__mmask8) __U);
 }
 
-static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
@@ -1316,4 +1316,3291 @@
   (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128)(a), \
                                          (__v2df)(__m128)(b), \
                                          (p), (__mmask8)(m)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    -(__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    -(__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   -(__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       -(__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        -(__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       -(__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        -(__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      -(__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       -(__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
+                         __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      -(__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       -(__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      (__v4df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4sf) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     (__v8sf) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
+  return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
+  return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
+  return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
+  return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df)
+                  _mm_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df)
+                  _mm256_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf)
+                 _mm256_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) {
+  __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
+            (__v2df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) {
+  __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
+            (__v4df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
+            (__v2di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
+            (__v4di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) {
+  __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
+            (__v4sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) {
+  __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
+            (__v8sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
+            (__v4si) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
+            (__v8si) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu32_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_pd (__m128i __A) {
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu32_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_ps (__m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+              (__v2df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+              (__v4df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+              (__v2di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+               (__v2di)
+               _mm_setzero_si128 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+              (__v4di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+               (__v4di)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+                   (__v4sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+                   (__v8sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+              (__v4si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+               (__v4si)
+               _mm_setzero_si128 (),
+               (__mmask8)     __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+              (__v8si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_getexp_pd (__m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_getexp_pd (__m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_getexp_ps (__m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_getexp_ps (__m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi64 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi64 (__m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+#define _mm_roundscale_pd(__A, __imm) __extension__ ({ \
+  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, \
+                   __imm, (__v2df) _mm_setzero_pd (), (__mmask8) -1); })
+
+
+#define _mm_mask_roundscale_pd(__W, __U, __A, __imm) __extension__ ({ \
+  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, __imm, \
+                   (__v2df) __W, (__mmask8) __U); })
+
+
+#define _mm_maskz_roundscale_pd(__U, __A, __imm) __extension__ ({ \
+  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, __imm, \
+                   (__v2df) _mm_setzero_pd (), (__mmask8) __U); })
+
+
+#define _mm256_roundscale_pd(__A, __imm) __extension__ ({ \
+  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
+                   (__v4df) _mm256_setzero_pd (), (__mmask8) -1); })
+
+
+#define _mm256_mask_roundscale_pd(__W, __U, __A, __imm) __extension__ ({ \
+  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
+                   (__v4df) __W, (__mmask8) __U); })
+
+
+#define _mm256_maskz_roundscale_pd(__U, __A, __imm)  __extension__ ({ \
+  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
+                   (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+
+#define _mm_roundscale_ps(__A, __imm)  __extension__ ({ \
+  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
+                  (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+
+
+#define _mm_mask_roundscale_ps(__W, __U, __A, __imm)  __extension__ ({ \
+  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
+                  (__v4sf) __W, (__mmask8) __U); })
+
+
+#define _mm_maskz_roundscale_ps(__U, __A, __imm)  __extension__ ({ \
+  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
+                  (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+
+#define _mm256_roundscale_ps(__A, __imm)  __extension__ ({ \
+  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,__imm, \
+                  (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+
+#define _mm256_mask_roundscale_ps(__W, __U, __A,__imm)  __extension__ ({ \
+  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, __imm, \
+                  (__v8sf) __W, (__mmask8) __U); })
+
+
+#define _mm256_maskz_roundscale_ps(__U, __A, __imm)  __extension__ ({ \
+  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, __imm, \
+                  (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_scalef_pd (__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_scalef_pd (__m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_scalef_ps (__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_scalef_ps (__m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+#define _mm_i64scatter_pd(__addr,__index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df(__addr, (__mmask8) 0xFF, (__v2di) __index, \
+                              (__v2df) __v1, __scale); })
+
+#define _mm_mask_i64scatter_pd(__addr, __mask, __index, __v1, \
+                               __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index, \
+                               (__v2df) __v1, __scale); })
+
+
+#define _mm_i64scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF, \
+        (__v2di) __index, (__v2di) __v1, __scale); })
+
+#define _mm_mask_i64scatter_epi64(__addr, __mask, __index, __v1,\
+                                  __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index,\
+        (__v2di) __v1, __scale); })
+
+#define _mm256_i64scatter_pd(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF,\
+        (__v4di) __index, (__v4df) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_pd(__addr, __mask, __index, __v1,\
+                                   __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index,\
+        (__v4df) __v1, __scale); })
+
+#define _mm256_i64scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF, (__v4di) __index,\
+                               (__v4di) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_epi64(__addr, __mask, __index, __v1,\
+                                      __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index,\
+        (__v4di) __v1, __scale); })
+
+#define _mm_i64scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF,\
+        (__v2di) __index, (__v4sf) __v1, __scale); })
+
+#define _mm_mask_i64scatter_ps(__addr, __mask, __index, __v1, \
+                                __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index,\
+        (__v4sf) __v1, __scale); })
+
+#define _mm_i64scatter_epi32(__addr, __index, __v1, \
+                              __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF,\
+        (__v2di) __index, (__v4si) __v1, __scale); })
+
+#define _mm_mask_i64scatter_epi32(__addr, __mask, __index, __v1,\
+         __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index,\
+        (__v4si) __v1, __scale); })
+
+#define _mm256_i64scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF, (__v4di) __index, \
+                              (__v4sf) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_ps(__addr, __mask, __index, __v1, \
+                                   __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index, \
+        (__v4sf) __v1, __scale); })
+
+#define _mm256_i64scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF, \
+        (__v4di) __index, (__v4si) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_epi32(__addr, __mask, __index, __v1, \
+                                      __scale) __extension__ ({  \
+  __builtin_ia32_scatterdiv8si(__addr, __mask, (__v4di) __index, \
+        (__v4si) __v1, __scale); })
+
+#define _mm_i32scatter_pd(__addr, __index, __v1,         \
+                          __scale) __extension__ ({      \
+  __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF, \
+        (__v4si) __index, (__v2df) __v1, __scale); })
+
+#define _mm_mask_i32scatter_pd(__addr, __mask, __index, __v1,    \
+                                __scale) __extension__ ({        \
+  __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index,\
+         (__v2df) __v1, __scale); })
+
+#define _mm_i32scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF,                       \
+        (__v4si) __index, (__v2di) __v1, __scale); })
+
+#define _mm_mask_i32scatter_epi64(__addr, __mask, __index, __v1, \
+         __scale) __extension__ ({                                \
+  __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index, \
+        (__v2di) __v1, __scale); })
+
+#define _mm256_i32scatter_pd(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF,                      \
+        (__v4si) __index, (__v4df) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_pd(__addr, __mask, __index, __v1, \
+         __scale) __extension__ ({                                \
+  __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index, \
+        (__v4df) __v1, __scale); })
+
+#define _mm256_i32scatter_epi64(__addr, __index, __v1,    \
+                                __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF,  \
+        (__v4si) __index, (__v4di) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_epi64(__addr, __mask, __index, __v1, \
+            __scale) __extension__ ({                               \
+  __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index,   \
+        (__v4di) __v1, __scale); })
+
+#define _mm_i32scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF,                   \
+        (__v4si) __index, (__v4sf) __v1, __scale); })
+
+#define _mm_mask_i32scatter_ps(__addr, __mask, __index, __v1,     \
+                               __scale) __extension__ ({          \
+  __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index, \
+        (__v4sf) __v1, __scale); })
+
+#define _mm_i32scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF,                       \
+        (__v4si) __index, (__v4si) __v1, __scale); })
+
+#define _mm_mask_i32scatter_epi32(__addr, __mask, __index, __v1, \
+                                  __scale) __extension__ ({      \
+  __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index,\
+        (__v4si) __v1, __scale); })
+
+#define _mm256_i32scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF,                      \
+        (__v8si) __index, (__v8sf) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_ps(__addr, __mask, __index, __v1, \
+                                   __scale) __extension__ ({     \
+  __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index,\
+        (__v8sf) __v1, __scale); })
+
+#define _mm256_i32scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF,                         \
+        (__v8si) __index, (__v8si) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_epi32(__addr, __mask, __index, __v1, \
+            __scale) __extension__ ({                                \
+  __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index,    \
+        (__v8si) __v1, __scale); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+              (__v2df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+              (__v4df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+              (__v4df)
+              _mm256_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+             (__v4sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+             (__v8sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+             (__v8sf)
+             _mm256_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A,
+              (__v2di) __I
+              /* idx */ ,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A,
+              (__v4di) __I
+              /* idx */ ,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A,
+                   (__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A,
+                   (__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8) -
+              1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I
+               /* idx */ ,
+               (__v2df) __A,
+               (__v2df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8) -
+              1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I
+               /* idx */ ,
+               (__v4df) __A,
+               (__v4df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I,
+        __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4sf) __A,
+              (__v4sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8sf) __A,
+              (__v8sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I
+              /* idx */ ,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8)
+              __U);
+}
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I
+              /* idx */ ,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8)
+              __U);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_BOTH
+
 #endif /* __AVX512VLINTRIN_H */
diff --git a/renderscript/clang-include/avxintrin.h b/renderscript/clang-include/avxintrin.h
index 4907965..6d1ca54 100644
--- a/renderscript/clang-include/avxintrin.h
+++ b/renderscript/clang-include/avxintrin.h
@@ -35,126 +35,131 @@
 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
 
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v32qs __attribute__((__vector_size__(32)));
+
 typedef float __m256 __attribute__ ((__vector_size__ (32)));
 typedef double __m256d __attribute__((__vector_size__(32)));
 typedef long long __m256i __attribute__((__vector_size__(32)));
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
+
 /* Arithmetic */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_add_pd(__m256d __a, __m256d __b)
 {
   return __a+__b;
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_add_ps(__m256 __a, __m256 __b)
 {
   return __a+__b;
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_sub_pd(__m256d __a, __m256d __b)
 {
   return __a-__b;
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_sub_ps(__m256 __a, __m256 __b)
 {
   return __a-__b;
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_addsub_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_addsub_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_div_pd(__m256d __a, __m256d __b)
 {
   return __a / __b;
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_div_ps(__m256 __a, __m256 __b)
 {
   return __a / __b;
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_max_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_max_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_min_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_min_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_mul_pd(__m256d __a, __m256d __b)
 {
   return __a * __b;
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_mul_ps(__m256 __a, __m256 __b)
 {
   return __a * __b;
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_sqrt_pd(__m256d __a)
 {
   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_sqrt_ps(__m256 __a)
 {
   return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_rsqrt_ps(__m256 __a)
 {
   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_rcp_ps(__m256 __a)
 {
   return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
 }
 
 #define _mm256_round_pd(V, M) __extension__ ({ \
-    __m256d __V = (V); \
-    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
+    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
 
 #define _mm256_round_ps(V, M) __extension__ ({ \
-  __m256 __V = (V); \
-  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
+  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
 
 #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
@@ -162,125 +167,125 @@
 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
 
 /* Logical */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_and_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)((__v4di)__a & (__v4di)__b);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_and_ps(__m256 __a, __m256 __b)
 {
   return (__m256)((__v8si)__a & (__v8si)__b);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_andnot_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)(~(__v4di)__a & (__v4di)__b);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_andnot_ps(__m256 __a, __m256 __b)
 {
   return (__m256)(~(__v8si)__a & (__v8si)__b);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_or_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)((__v4di)__a | (__v4di)__b);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_or_ps(__m256 __a, __m256 __b)
 {
   return (__m256)((__v8si)__a | (__v8si)__b);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_xor_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)((__v4di)__a ^ (__v4di)__b);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_xor_ps(__m256 __a, __m256 __b)
 {
   return (__m256)((__v8si)__a ^ (__v8si)__b);
 }
 
 /* Horizontal arithmetic */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_hadd_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_hadd_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_hsub_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_hsub_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
 }
 
 /* Vector permutations */
-static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline __m128d __DEFAULT_FN_ATTRS
 _mm_permutevar_pd(__m128d __a, __m128i __c)
 {
   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_permutevar_pd(__m256d __a, __m256i __c)
 {
   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
 }
 
-static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline __m128 __DEFAULT_FN_ATTRS
 _mm_permutevar_ps(__m128 __a, __m128i __c)
 {
   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_permutevar_ps(__m256 __a, __m256i __c)
 {
   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
 }
 
 #define _mm_permute_pd(A, C) __extension__ ({ \
-  __m128d __A = (A); \
-  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
+                                   (__v2df)_mm_setzero_pd(), \
                                    (C) & 0x1, ((C) & 0x2) >> 1); })
 
 #define _mm256_permute_pd(A, C) __extension__ ({ \
-  __m256d __A = (A); \
-  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
+                                   (__v4df)_mm256_setzero_pd(), \
                                    (C) & 0x1, ((C) & 0x2) >> 1, \
                                    2 + (((C) & 0x4) >> 2), \
                                    2 + (((C) & 0x8) >> 3)); })
 
 #define _mm_permute_ps(A, C) __extension__ ({ \
-  __m128 __A = (A); \
-  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
+                                  (__v4sf)_mm_setzero_ps(), \
                                    (C) & 0x3, ((C) & 0xc) >> 2, \
                                    ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
 
 #define _mm256_permute_ps(A, C) __extension__ ({ \
-  __m256 __A = (A); \
-  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
+                                  (__v8sf)_mm256_setzero_ps(), \
                                   (C) & 0x3, ((C) & 0xc) >> 2, \
                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
                                   4 + (((C) & 0x03) >> 0), \
@@ -289,34 +294,29 @@
                                   4 + (((C) & 0xc0) >> 6)); })
 
 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
-  __m256d __V1 = (V1); \
-  __m256d __V2 = (V2); \
-  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })
+  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+                                           (__v4df)(__m256d)(V2), (M)); })
 
 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
-  __m256 __V1 = (V1); \
-  __m256 __V2 = (V2); \
-  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+                                          (__v8sf)(__m256)(V2), (M)); })
 
 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
-  __m256i __V1 = (V1); \
-  __m256i __V2 = (V2); \
-  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
+  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+                                           (__v8si)(__m256i)(V2), (M)); })
 
 /* Vector Blend */
 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
-  __m256d __V1 = (V1); \
-  __m256d __V2 = (V2); \
-  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
+                                   (__v4df)(__m256d)(V2), \
                                    (((M) & 0x01) ? 4 : 0), \
                                    (((M) & 0x02) ? 5 : 1), \
                                    (((M) & 0x04) ? 6 : 2), \
                                    (((M) & 0x08) ? 7 : 3)); })
 
 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
-  __m256 __V1 = (V1); \
-  __m256 __V2 = (V2); \
-  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
+                                  (__v8sf)(__m256)(V2), \
                                   (((M) & 0x01) ?  8 : 0), \
                                   (((M) & 0x02) ?  9 : 1), \
                                   (((M) & 0x04) ? 10 : 2), \
@@ -326,14 +326,14 @@
                                   (((M) & 0x40) ? 14 : 6), \
                                   (((M) & 0x80) ? 15 : 7)); })
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
 {
   return (__m256d)__builtin_ia32_blendvpd256(
     (__v4df)__a, (__v4df)__b, (__v4df)__c);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 {
   return (__m256)__builtin_ia32_blendvps256(
@@ -342,28 +342,29 @@
 
 /* Vector Dot Product */
 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
-  __m256 __V1 = (V1); \
-  __m256 __V2 = (V2); \
-  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+                                 (__v8sf)(__m256)(V2), (M)); })
 
 /* Vector shuffle */
 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
-        __m256 __a = (a); \
-        __m256 __b = (b); \
-        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
-        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
-        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
-        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
-        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
+        (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
+                                        (__v8sf)(__m256)(b), \
+                                        (mask) & 0x3, \
+                                        ((mask) & 0xc) >> 2, \
+                                        (((mask) & 0x30) >> 4) + 8, \
+                                        (((mask) & 0xc0) >> 6) + 8, \
+                                        ((mask) & 0x3) + 4, \
+                                        (((mask) & 0xc) >> 2) + 4, \
+                                        (((mask) & 0x30) >> 4) + 12, \
+                                        (((mask) & 0xc0) >> 6) + 12); })
 
 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
-        __m256d __a = (a); \
-        __m256d __b = (b); \
-        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
-        (mask) & 0x1, \
-        (((mask) & 0x2) >> 1) + 4, \
-        (((mask) & 0x4) >> 2) + 2, \
-        (((mask) & 0x8) >> 3) + 6); })
+        (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), \
+                                         (mask) & 0x1, \
+                                         (((mask) & 0x2) >> 1) + 4, \
+                                         (((mask) & 0x4) >> 2) + 2, \
+                                         (((mask) & 0x8) >> 3) + 6); })
 
 /* Compare */
 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
@@ -400,50 +401,44 @@
 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
 
 #define _mm_cmp_pd(a, b, c) __extension__ ({ \
-  __m128d __a = (a); \
-  __m128d __b = (b); \
-  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
+  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
 
 #define _mm_cmp_ps(a, b, c) __extension__ ({ \
-  __m128 __a = (a); \
-  __m128 __b = (b); \
-  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
+  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
 
 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \
-  __m256d __a = (a); \
-  __m256d __b = (b); \
-  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
+  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), (c)); })
 
 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \
-  __m256 __a = (a); \
-  __m256 __b = (b); \
-  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
+  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), (c)); })
 
 #define _mm_cmp_sd(a, b, c) __extension__ ({ \
-  __m128d __a = (a); \
-  __m128d __b = (b); \
-  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
+  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
 
 #define _mm_cmp_ss(a, b, c) __extension__ ({ \
-  __m128 __a = (a); \
-  __m128 __b = (b); \
-  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
+  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi32(__m256i __a, const int __imm)
 {
   __v8si __b = (__v8si)__a;
   return __b[__imm & 7];
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi16(__m256i __a, const int __imm)
 {
   __v16hi __b = (__v16hi)__a;
   return __b[__imm & 15];
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi8(__m256i __a, const int __imm)
 {
   __v32qi __b = (__v32qi)__a;
@@ -451,7 +446,7 @@
 }
 
 #ifdef __x86_64__
-static __inline long long  __attribute__((__always_inline__, __nodebug__))
+static __inline long long  __DEFAULT_FN_ATTRS
 _mm256_extract_epi64(__m256i __a, const int __imm)
 {
   __v4di __b = (__v4di)__a;
@@ -459,7 +454,7 @@
 }
 #endif
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
 {
   __v8si __c = (__v8si)__a;
@@ -467,7 +462,7 @@
   return (__m256i)__c;
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
 {
   __v16hi __c = (__v16hi)__a;
@@ -475,7 +470,7 @@
   return (__m256i)__c;
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
 {
   __v32qi __c = (__v32qi)__a;
@@ -484,7 +479,7 @@
 }
 
 #ifdef __x86_64__
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
 {
   __v4di __c = (__v4di)__a;
@@ -494,263 +489,263 @@
 #endif
 
 /* Conversion */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_pd(__m128i __a)
 {
   return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_ps(__m256i __a)
 {
   return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
 }
 
-static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline __m128 __DEFAULT_FN_ATTRS
 _mm256_cvtpd_ps(__m256d __a)
 {
   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtps_epi32(__m256 __a)
 {
   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_cvtps_pd(__m128 __a)
 {
   return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
 }
 
-static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_cvttpd_epi32(__m256d __a)
 {
   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
 }
 
-static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_cvtpd_epi32(__m256d __a)
 {
   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_cvttps_epi32(__m256 __a)
 {
   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
 }
 
 /* Vector replicate */
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_movehdup_ps(__m256 __a)
 {
   return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_moveldup_ps(__m256 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_movedup_pd(__m256d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
 }
 
 /* Unpack and Interleave */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_unpackhi_pd(__m256d __a, __m256d __b)
 {
   return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_unpacklo_pd(__m256d __a, __m256d __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_unpackhi_ps(__m256 __a, __m256 __b)
 {
   return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_unpacklo_ps(__m256 __a, __m256 __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
 }
 
 /* Bit Test */
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm_testz_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm_testc_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm_testnzc_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm_testz_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm_testc_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm_testnzc_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_testz_pd(__m256d __a, __m256d __b)
 {
   return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_testc_pd(__m256d __a, __m256d __b)
 {
   return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_testnzc_pd(__m256d __a, __m256d __b)
 {
   return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_testz_ps(__m256 __a, __m256 __b)
 {
   return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_testc_ps(__m256 __a, __m256 __b)
 {
   return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_testnzc_ps(__m256 __a, __m256 __b)
 {
   return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_testz_si256(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_testc_si256(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_testnzc_si256(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
 }
 
 /* Vector extract sign mask */
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_movemask_pd(__m256d __a)
 {
   return __builtin_ia32_movmskpd256((__v4df)__a);
 }
 
-static __inline int __attribute__((__always_inline__, __nodebug__))
+static __inline int __DEFAULT_FN_ATTRS
 _mm256_movemask_ps(__m256 __a)
 {
   return __builtin_ia32_movmskps256((__v8sf)__a);
 }
 
 /* Vector __zero */
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_zeroall(void)
 {
   __builtin_ia32_vzeroall();
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_zeroupper(void)
 {
   __builtin_ia32_vzeroupper();
 }
 
 /* Vector load with broadcast */
-static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline __m128 __DEFAULT_FN_ATTRS
 _mm_broadcast_ss(float const *__a)
 {
   float __f = *__a;
   return (__m128)(__v4sf){ __f, __f, __f, __f };
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_broadcast_sd(double const *__a)
 {
   double __d = *__a;
   return (__m256d)(__v4df){ __d, __d, __d, __d };
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_broadcast_ss(float const *__a)
 {
   float __f = *__a;
   return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_broadcast_pd(__m128d const *__a)
 {
   return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_broadcast_ps(__m128 const *__a)
 {
   return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
 }
 
 /* SIMD load ops */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_load_pd(double const *__p)
 {
   return *(__m256d *)__p;
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_load_ps(float const *__p)
 {
   return *(__m256 *)__p;
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_loadu_pd(double const *__p)
 {
   struct __loadu_pd {
@@ -759,7 +754,7 @@
   return ((struct __loadu_pd*)__p)->__v;
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_loadu_ps(float const *__p)
 {
   struct __loadu_ps {
@@ -768,13 +763,13 @@
   return ((struct __loadu_ps*)__p)->__v;
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_load_si256(__m256i const *__p)
 {
   return *__p;
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_loadu_si256(__m256i const *__p)
 {
   struct __loadu_si256 {
@@ -783,141 +778,159 @@
   return ((struct __loadu_si256*)__p)->__v;
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_lddqu_si256(__m256i const *__p)
 {
   return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
 }
 
 /* SIMD store ops */
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_store_pd(double *__p, __m256d __a)
 {
   *(__m256d *)__p = __a;
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_store_ps(float *__p, __m256 __a)
 {
   *(__m256 *)__p = __a;
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_pd(double *__p, __m256d __a)
 {
   __builtin_ia32_storeupd256(__p, (__v4df)__a);
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_ps(float *__p, __m256 __a)
 {
   __builtin_ia32_storeups256(__p, (__v8sf)__a);
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_store_si256(__m256i *__p, __m256i __a)
 {
   *__p = __a;
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_si256(__m256i *__p, __m256i __a)
 {
   __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
 }
 
 /* Conditional load ops */
-static __inline __m128d __attribute__((__always_inline__, __nodebug__))
-_mm_maskload_pd(double const *__p, __m128d __m)
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm_maskload_pd(double const *__p, __m128i __m)
 {
-  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
+  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
-_mm256_maskload_pd(double const *__p, __m256d __m)
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_maskload_pd(double const *__p, __m256i __m)
 {
   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
-                                               (__v4df)__m);
+                                               (__v4di)__m);
 }
 
-static __inline __m128 __attribute__((__always_inline__, __nodebug__))
-_mm_maskload_ps(float const *__p, __m128 __m)
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_maskload_ps(float const *__p, __m128i __m)
 {
-  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
+  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
-_mm256_maskload_ps(float const *__p, __m256 __m)
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_maskload_ps(float const *__p, __m256i __m)
 {
-  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
+  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
 }
 
 /* Conditional store ops */
-static __inline void __attribute__((__always_inline__, __nodebug__))
-_mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
 {
-  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
+  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
-_mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
 {
-  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
+  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
-_mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
 {
-  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
+  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
-_mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 {
-  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
+  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
 }
 
 /* Cacheability support ops */
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_si256(__m256i *__a, __m256i __b)
 {
   __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_pd(double *__a, __m256d __b)
 {
   __builtin_ia32_movntpd256(__a, (__v4df)__b);
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_ps(float *__p, __m256 __a)
 {
   __builtin_ia32_movntps256(__p, (__v8sf)__a);
 }
 
 /* Create vectors */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_undefined_pd()
+{
+  return (__m256d)__builtin_ia32_undef256();
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_undefined_ps()
+{
+  return (__m256)__builtin_ia32_undef256();
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_undefined_si256()
+{
+  return (__m256i)__builtin_ia32_undef256();
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_set_pd(double __a, double __b, double __c, double __d)
 {
   return (__m256d){ __d, __c, __b, __a };
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_set_ps(float __a, float __b, float __c, float __d,
               float __e, float __f, float __g, float __h)
 {
   return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
                  int __i4, int __i5, int __i6, int __i7)
 {
   return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
                  short __w11, short __w10, short __w09, short __w08,
                  short __w07, short __w06, short __w05, short __w04,
@@ -927,7 +940,7 @@
     __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
                 char __b27, char __b26, char __b25, char __b24,
                 char __b23, char __b22, char __b21, char __b20,
@@ -945,34 +958,34 @@
   };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
 {
   return (__m256i)(__v4di){ __d, __c, __b, __a };
 }
 
 /* Create vectors with elements in reverse order */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_setr_pd(double __a, double __b, double __c, double __d)
 {
   return (__m256d){ __a, __b, __c, __d };
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_setr_ps(float __a, float __b, float __c, float __d,
                float __e, float __f, float __g, float __h)
 {
   return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
                   int __i4, int __i5, int __i6, int __i7)
 {
   return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
        short __w11, short __w10, short __w09, short __w08,
        short __w07, short __w06, short __w05, short __w04,
@@ -982,7 +995,7 @@
     __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
                  char __b27, char __b26, char __b25, char __b24,
                  char __b23, char __b22, char __b21, char __b20,
@@ -999,39 +1012,39 @@
     __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
 {
   return (__m256i)(__v4di){ __a, __b, __c, __d };
 }
 
 /* Create vectors with repeated elements */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_set1_pd(double __w)
 {
   return (__m256d){ __w, __w, __w, __w };
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_set1_ps(float __w)
 {
   return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi32(int __i)
 {
   return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi16(short __w)
 {
   return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
     __w, __w, __w, __w, __w, __w };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi8(char __b)
 {
   return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
@@ -1039,112 +1052,112 @@
     __b, __b, __b, __b, __b, __b, __b };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi64x(long long __q)
 {
   return (__m256i)(__v4di){ __q, __q, __q, __q };
 }
 
 /* Create __zeroed vectors */
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_setzero_pd(void)
 {
   return (__m256d){ 0, 0, 0, 0 };
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_setzero_ps(void)
 {
   return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setzero_si256(void)
 {
   return (__m256i){ 0LL, 0LL, 0LL, 0LL };
 }
 
 /* Cast between vector types */
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castpd_ps(__m256d __a)
 {
   return (__m256)__a;
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castpd_si256(__m256d __a)
 {
   return (__m256i)__a;
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castps_pd(__m256 __a)
 {
   return (__m256d)__a;
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castps_si256(__m256 __a)
 {
   return (__m256i)__a;
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castsi256_ps(__m256i __a)
 {
   return (__m256)__a;
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castsi256_pd(__m256i __a)
 {
   return (__m256d)__a;
 }
 
-static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline __m128d __DEFAULT_FN_ATTRS
 _mm256_castpd256_pd128(__m256d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1);
 }
 
-static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline __m128 __DEFAULT_FN_ATTRS
 _mm256_castps256_ps128(__m256 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
 }
 
-static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_castsi256_si128(__m256i __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castpd128_pd256(__m128d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castps128_ps256(__m128 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castsi128_si256(__m128i __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
 }
 
-/* 
+/*
    Vector insert.
    We use macros rather than inlines because we only want to accept
    invocations where the immediate M is a constant expression.
 */
 #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
   (__m256)__builtin_shufflevector( \
-    (__v8sf)(V1), \
+    (__v8sf)(__m256)(V1), \
     (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
     (((M) & 1) ?  0 :  8), \
     (((M) & 1) ?  1 :  9), \
@@ -1157,7 +1170,7 @@
 
 #define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
   (__m256d)__builtin_shufflevector( \
-    (__v4df)(V1), \
+    (__v4df)(__m256d)(V1), \
     (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
     (((M) & 1) ? 0 : 4), \
     (((M) & 1) ? 1 : 5), \
@@ -1166,21 +1179,21 @@
 
 #define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
   (__m256i)__builtin_shufflevector( \
-    (__v4di)(V1), \
+    (__v4di)(__m256i)(V1), \
     (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
     (((M) & 1) ? 0 : 4), \
     (((M) & 1) ? 1 : 5), \
     (((M) & 1) ? 4 : 2), \
     (((M) & 1) ? 5 : 3) );})
 
-/* 
+/*
    Vector extract.
    We use macros rather than inlines because we only want to accept
    invocations where the immediate M is a constant expression.
 */
 #define _mm256_extractf128_ps(V, M) __extension__ ({ \
   (__m128)__builtin_shufflevector( \
-    (__v8sf)(V), \
+    (__v8sf)(__m256)(V), \
     (__v8sf)(_mm256_setzero_ps()), \
     (((M) & 1) ? 4 : 0), \
     (((M) & 1) ? 5 : 1), \
@@ -1189,20 +1202,20 @@
 
 #define _mm256_extractf128_pd(V, M) __extension__ ({ \
   (__m128d)__builtin_shufflevector( \
-    (__v4df)(V), \
+    (__v4df)(__m256d)(V), \
     (__v4df)(_mm256_setzero_pd()), \
     (((M) & 1) ? 2 : 0), \
     (((M) & 1) ? 3 : 1) );})
 
 #define _mm256_extractf128_si256(V, M) __extension__ ({ \
   (__m128i)__builtin_shufflevector( \
-    (__v4di)(V), \
+    (__v4di)(__m256i)(V), \
     (__v4di)(_mm256_setzero_si256()), \
     (((M) & 1) ? 2 : 0), \
     (((M) & 1) ? 3 : 1) );})
 
 /* SIMD load ops (unaligned) */
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
 {
   struct __loadu_ps {
@@ -1213,18 +1226,18 @@
   return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
 {
   struct __loadu_pd {
     __m128d __v;
   } __attribute__((__packed__, __may_alias__));
-  
+
   __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
   return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
 {
   struct __loadu_si128 {
@@ -1237,7 +1250,7 @@
 }
 
 /* SIMD store ops (unaligned) */
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
 {
   __m128 __v128;
@@ -1248,7 +1261,7 @@
   __builtin_ia32_storeups(__addr_hi, __v128);
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
 {
   __m128d __v128;
@@ -1259,7 +1272,7 @@
   __builtin_ia32_storeupd(__addr_hi, __v128);
 }
 
-static __inline void __attribute__((__always_inline__, __nodebug__))
+static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
 {
   __m128i __v128;
@@ -1270,34 +1283,36 @@
   __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_set_m128 (__m128 __hi, __m128 __lo) {
   return (__m256) __builtin_shufflevector(__lo, __hi, 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_set_m128d (__m128d __hi, __m128d __lo) {
   return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_m128i (__m128i __hi, __m128i __lo) {
   return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_setr_m128 (__m128 __lo, __m128 __hi) {
   return _mm256_set_m128(__hi, __lo);
 }
 
-static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_setr_m128d (__m128d __lo, __m128d __hi) {
   return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
 }
 
-static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_m128i (__m128i __lo, __m128i __hi) {
   return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
 }
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __AVXINTRIN_H */
diff --git a/renderscript/clang-include/bmi2intrin.h b/renderscript/clang-include/bmi2intrin.h
index a05cfad..fdae82c 100644
--- a/renderscript/clang-include/bmi2intrin.h
+++ b/renderscript/clang-include/bmi2intrin.h
@@ -25,26 +25,25 @@
 #error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
 #endif
 
-#ifndef __BMI2__
-# error "BMI2 instruction set not enabled"
-#endif /* __BMI2__ */
-
 #ifndef __BMI2INTRIN_H
 #define __BMI2INTRIN_H
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _bzhi_u32(unsigned int __X, unsigned int __Y)
 {
   return __builtin_ia32_bzhi_si(__X, __Y);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _pdep_u32(unsigned int __X, unsigned int __Y)
 {
   return __builtin_ia32_pdep_si(__X, __Y);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _pext_u32(unsigned int __X, unsigned int __Y)
 {
   return __builtin_ia32_pext_si(__X, __Y);
@@ -52,25 +51,25 @@
 
 #ifdef  __x86_64__
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _bzhi_u64(unsigned long long __X, unsigned long long __Y)
 {
   return __builtin_ia32_bzhi_di(__X, __Y);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _pdep_u64(unsigned long long __X, unsigned long long __Y)
 {
   return __builtin_ia32_pdep_di(__X, __Y);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _pext_u64(unsigned long long __X, unsigned long long __Y)
 {
   return __builtin_ia32_pext_di(__X, __Y);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _mulx_u64 (unsigned long long __X, unsigned long long __Y,
 	   unsigned long long *__P)
 {
@@ -81,7 +80,7 @@
 
 #else /* !__x86_64__ */
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
 {
   unsigned long long __res = (unsigned long long) __X * __Y;
@@ -91,4 +90,6 @@
 
 #endif /* !__x86_64__  */
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __BMI2INTRIN_H */
diff --git a/renderscript/clang-include/bmiintrin.h b/renderscript/clang-include/bmiintrin.h
index 0e5fd55..da98792 100644
--- a/renderscript/clang-include/bmiintrin.h
+++ b/renderscript/clang-include/bmiintrin.h
@@ -25,10 +25,6 @@
 #error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
 #endif
 
-#ifndef __BMI__
-# error "BMI instruction set not enabled"
-#endif /* __BMI__ */
-
 #ifndef __BMIINTRIN_H
 #define __BMIINTRIN_H
 
@@ -40,51 +36,59 @@
 #define _blsr_u32(a)      (__blsr_u32((a)))
 #define _tzcnt_u32(a)     (__tzcnt_u32((a)))
 
-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
+
+/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
+   instruction behaves as BSF on non-BMI targets, there is code that expects
+   to use it as a potentially faster version of BSF. */
+#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+static __inline__ unsigned short __RELAXED_FN_ATTRS
 __tzcnt_u16(unsigned short __X)
 {
   return __X ? __builtin_ctzs(__X) : 16;
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __andn_u32(unsigned int __X, unsigned int __Y)
 {
   return ~__X & __Y;
 }
 
 /* AMD-specified, double-leading-underscore version of BEXTR */
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __bextr_u32(unsigned int __X, unsigned int __Y)
 {
   return __builtin_ia32_bextr_u32(__X, __Y);
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR */
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
 {
   return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsi_u32(unsigned int __X)
 {
   return __X & -__X;
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsmsk_u32(unsigned int __X)
 {
   return __X ^ (__X - 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsr_u32(unsigned int __X)
 {
   return __X & (__X - 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __RELAXED_FN_ATTRS
 __tzcnt_u32(unsigned int __X)
 {
   return __X ? __builtin_ctz(__X) : 32;
@@ -99,45 +103,45 @@
 #define _blsr_u64(a)      (__blsr_u64((a)))
 #define _tzcnt_u64(a)     (__tzcnt_u64((a)))
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __andn_u64 (unsigned long long __X, unsigned long long __Y)
 {
   return ~__X & __Y;
 }
 
 /* AMD-specified, double-leading-underscore version of BEXTR */
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __bextr_u64(unsigned long long __X, unsigned long long __Y)
 {
   return __builtin_ia32_bextr_u64(__X, __Y);
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR */
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
 {
   return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsi_u64(unsigned long long __X)
 {
   return __X & -__X;
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsmsk_u64(unsigned long long __X)
 {
   return __X ^ (__X - 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsr_u64(unsigned long long __X)
 {
   return __X & (__X - 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __RELAXED_FN_ATTRS
 __tzcnt_u64(unsigned long long __X)
 {
   return __X ? __builtin_ctzll(__X) : 64;
@@ -145,4 +149,7 @@
 
 #endif /* __x86_64__ */
 
+#undef __DEFAULT_FN_ATTRS
+#undef __RELAXED_FN_ATTRS
+
 #endif /* __BMIINTRIN_H */
diff --git a/renderscript/clang-include/emmintrin.h b/renderscript/clang-include/emmintrin.h
index c764d68..cfc2c71 100644
--- a/renderscript/clang-include/emmintrin.h
+++ b/renderscript/clang-include/emmintrin.h
@@ -24,10 +24,6 @@
 #ifndef __EMMINTRIN_H
 #define __EMMINTRIN_H
 
-#ifndef __SSE2__
-#error "SSE2 instruction set not enabled"
-#else
-
 #include <xmmintrin.h>
 
 typedef double __m128d __attribute__((__vector_size__(16)));
@@ -39,433 +35,442 @@
 typedef short __v8hi __attribute__((__vector_size__(16)));
 typedef char __v16qi __attribute__((__vector_size__(16)));
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v16qs __attribute__((__vector_size__(16)));
+
+#include <f16cintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_add_sd(__m128d __a, __m128d __b)
 {
   __a[0] += __b[0];
   return __a;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_add_pd(__m128d __a, __m128d __b)
 {
   return __a + __b;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sub_sd(__m128d __a, __m128d __b)
 {
   __a[0] -= __b[0];
   return __a;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sub_pd(__m128d __a, __m128d __b)
 {
   return __a - __b;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mul_sd(__m128d __a, __m128d __b)
 {
   __a[0] *= __b[0];
   return __a;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mul_pd(__m128d __a, __m128d __b)
 {
   return __a * __b;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_div_sd(__m128d __a, __m128d __b)
 {
   __a[0] /= __b[0];
   return __a;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_div_pd(__m128d __a, __m128d __b)
 {
   return __a / __b;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sqrt_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_sqrtsd(__b);
   return (__m128d) { __c[0], __a[1] };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sqrt_pd(__m128d __a)
 {
   return __builtin_ia32_sqrtpd(__a);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_min_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_minsd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_min_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_minpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_max_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_maxsd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_max_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_maxpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_and_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v4si)__a & (__v4si)__b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_andnot_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)(~(__v4si)__a & (__v4si)__b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_or_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v4si)__a | (__v4si)__b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_xor_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v4si)__a ^ (__v4si)__b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpeq_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmplt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmple_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmplepd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpgt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpge_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmplepd(__b, __a);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpord_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpunord_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpneq_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnle_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpngt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnge_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpeq_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmplt_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmple_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmplesd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpgt_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
   return (__m128d) { __c[0], __a[1] };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpge_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_cmplesd(__b, __a);
   return (__m128d) { __c[0], __a[1] };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpord_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpunord_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpneq_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnle_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpngt_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
   return (__m128d) { __c[0], __a[1] };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnge_sd(__m128d __a, __m128d __b)
 {
   __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
   return (__m128d) { __c[0], __a[1] };
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdeq(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdlt(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdle(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdgt(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdge(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdneq(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdeq(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdlt(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdle(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdgt(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdge(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdneq(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpd_ps(__m128d __a)
 {
   return __builtin_ia32_cvtpd2ps(__a);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtps_pd(__m128 __a)
 {
   return __builtin_ia32_cvtps2pd(__a);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtepi32_pd(__m128i __a)
 {
   return __builtin_ia32_cvtdq2pd((__v4si)__a);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtpd_epi32(__m128d __a)
 {
   return __builtin_ia32_cvtpd2dq(__a);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsd_si32(__m128d __a)
 {
   return __builtin_ia32_cvtsd2si(__a);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsd_ss(__m128 __a, __m128d __b)
 {
   __a[0] = __b[0];
   return __a;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtsi32_sd(__m128d __a, int __b)
 {
   __a[0] = __b;
   return __a;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtss_sd(__m128d __a, __m128 __b)
 {
   __a[0] = __b[0];
   return __a;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvttpd_epi32(__m128d __a)
 {
   return (__m128i)__builtin_ia32_cvttpd2dq(__a);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvttsd_si32(__m128d __a)
 {
   return __a[0];
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtpd_pi32(__m128d __a)
 {
   return (__m64)__builtin_ia32_cvtpd2pi(__a);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvttpd_pi32(__m128d __a)
 {
   return (__m64)__builtin_ia32_cvttpd2pi(__a);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtpi32_pd(__m64 __a)
 {
   return __builtin_ia32_cvtpi2pd((__v2si)__a);
 }
 
-static __inline__ double __attribute__((__always_inline__, __nodebug__))
+static __inline__ double __DEFAULT_FN_ATTRS
 _mm_cvtsd_f64(__m128d __a)
 {
   return __a[0];
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load_pd(double const *__dp)
 {
   return *(__m128d*)__dp;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load1_pd(double const *__dp)
 {
   struct __mm_load1_pd_struct {
@@ -477,14 +482,14 @@
 
 #define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadr_pd(double const *__dp)
 {
   __m128d __u = *(__m128d*)__dp;
   return __builtin_shufflevector(__u, __u, 1, 0);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadu_pd(double const *__dp)
 {
   struct __loadu_pd {
@@ -493,7 +498,7 @@
   return ((struct __loadu_pd*)__dp)->__v;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load_sd(double const *__dp)
 {
   struct __mm_load_sd_struct {
@@ -503,7 +508,7 @@
   return (__m128d){ __u, 0 };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadh_pd(__m128d __a, double const *__dp)
 {
   struct __mm_loadh_pd_struct {
@@ -513,7 +518,7 @@
   return (__m128d){ __a[0], __u };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadl_pd(__m128d __a, double const *__dp)
 {
   struct __mm_loadl_pd_struct {
@@ -523,43 +528,49 @@
   return (__m128d){ __u, __a[1] };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_undefined_pd()
+{
+  return (__m128d)__builtin_ia32_undef128();
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_set_sd(double __w)
 {
   return (__m128d){ __w, 0 };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_set1_pd(double __w)
 {
   return (__m128d){ __w, __w };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_set_pd(double __w, double __x)
 {
   return (__m128d){ __x, __w };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_setr_pd(double __w, double __x)
 {
   return (__m128d){ __w, __x };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_setzero_pd(void)
 {
   return (__m128d){ 0, 0 };
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_move_sd(__m128d __a, __m128d __b)
 {
   return (__m128d){ __b[0], __a[1] };
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_sd(double *__dp, __m128d __a)
 {
   struct __mm_store_sd_struct {
@@ -568,7 +579,7 @@
   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store1_pd(double *__dp, __m128d __a)
 {
   struct __mm_store1_pd_struct {
@@ -578,26 +589,26 @@
   ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_pd(double *__dp, __m128d __a)
 {
   *(__m128d *)__dp = __a;
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_pd(double *__dp, __m128d __a)
 {
   __builtin_ia32_storeupd(__dp, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storer_pd(double *__dp, __m128d __a)
 {
   __a = __builtin_shufflevector(__a, __a, 1, 0);
   *(__m128d *)__dp = __a;
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeh_pd(double *__dp, __m128d __a)
 {
   struct __mm_storeh_pd_struct {
@@ -606,7 +617,7 @@
   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_pd(double *__dp, __m128d __a)
 {
   struct __mm_storeh_pd_struct {
@@ -615,211 +626,211 @@
   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qi)__a + (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a + (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a + (__v4si)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_si64(__m64 __a, __m64 __b)
 {
-  return __a + __b;
+  return (__m64)__builtin_ia32_paddq(__a, __b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi64(__m128i __a, __m128i __b)
 {
   return __a + __b;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_avg_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_avg_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_madd_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mulhi_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mulhi_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a * (__v8hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mul_su32(__m64 __a, __m64 __b)
 {
   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mul_epu32(__m128i __a, __m128i __b)
 {
   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sad_epu8(__m128i __a, __m128i __b)
 {
   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qi)__a - (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a - (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a - (__v4si)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_si64(__m64 __a, __m64 __b)
 {
-  return __a - __b;
+  return (__m64)__builtin_ia32_psubq(__a, __b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi64(__m128i __a, __m128i __b)
 {
   return __a - __b;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_and_si128(__m128i __a, __m128i __b)
 {
   return __a & __b;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_andnot_si128(__m128i __a, __m128i __b)
 {
   return ~__a & __b;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_or_si128(__m128i __a, __m128i __b)
 {
   return __a | __b;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_xor_si128(__m128i __a, __m128i __b)
 {
   return __a ^ __b;
@@ -848,61 +859,61 @@
 #define _mm_bslli_si128(a, imm) \
   _mm_slli_si128((a), (imm))
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi64(__m128i __a, int __count)
 {
   return __builtin_ia32_psllqi128(__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi64(__m128i __a, __m128i __count)
 {
   return __builtin_ia32_psllq128(__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
@@ -931,153 +942,152 @@
 #define _mm_bsrli_si128(a, imm) \
   _mm_srli_si128((a), (imm))
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi64(__m128i __a, int __count)
 {
   return __builtin_ia32_psrlqi128(__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi64(__m128i __a, __m128i __count)
 {
   return __builtin_ia32_psrlq128(__a, __count);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qi)__a == (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a == (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a == (__v4si)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
 {
   /* This function always performs a signed comparison, but __v16qi is a char
-     which may be signed or unsigned. */
-  typedef signed char __v16qs __attribute__((__vector_size__(16)));
+     which may be signed or unsigned, so use __v16qs. */
   return (__m128i)((__v16qs)__a > (__v16qs)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a > (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a > (__v4si)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi8(__m128i __a, __m128i __b)
 {
   return _mm_cmpgt_epi8(__b, __a);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi16(__m128i __a, __m128i __b)
 {
   return _mm_cmpgt_epi16(__b, __a);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi32(__m128i __a, __m128i __b)
 {
   return _mm_cmpgt_epi32(__b, __a);
 }
 
 #ifdef __x86_64__
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtsi64_sd(__m128d __a, long long __b)
 {
   __a[0] = __b;
   return __a;
 }
 
-static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtsd_si64(__m128d __a)
 {
   return __builtin_ia32_cvtsd2si64(__a);
 }
 
-static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvttsd_si64(__m128d __a)
 {
   return __a[0];
 }
 #endif
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtepi32_ps(__m128i __a)
 {
   return __builtin_ia32_cvtdq2ps((__v4si)__a);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtps_epi32(__m128 __a)
 {
   return (__m128i)__builtin_ia32_cvtps2dq(__a);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvttps_epi32(__m128 __a)
 {
   return (__m128i)__builtin_ia32_cvttps2dq(__a);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi32_si128(int __a)
 {
   return (__m128i)(__v4si){ __a, 0, 0, 0 };
 }
 
 #ifdef __x86_64__
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi64_si128(long long __a)
 {
   return (__m128i){ __a, 0 };
 }
 #endif
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsi128_si32(__m128i __a)
 {
   __v4si __b = (__v4si)__a;
@@ -1085,20 +1095,20 @@
 }
 
 #ifdef __x86_64__
-static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtsi128_si64(__m128i __a)
 {
   return __a[0];
 }
 #endif
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_load_si128(__m128i const *__p)
 {
   return *__p;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadu_si128(__m128i const *__p)
 {
   struct __loadu_si128 {
@@ -1107,7 +1117,7 @@
   return ((struct __loadu_si128*)__p)->__v;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadl_epi64(__m128i const *__p)
 {
   struct __mm_loadl_epi64_struct {
@@ -1116,115 +1126,121 @@
   return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_set_epi64x(long long q1, long long q0)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_undefined_si128()
 {
-  return (__m128i){ q0, q1 };
+  return (__m128i)__builtin_ia32_undef128();
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_set_epi64(__m64 q1, __m64 q0)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64x(long long __q1, long long __q0)
 {
-  return (__m128i){ (long long)q0, (long long)q1 };
+  return (__m128i){ __q0, __q1 };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_set_epi32(int i3, int i2, int i1, int i0)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64(__m64 __q1, __m64 __q0)
 {
-  return (__m128i)(__v4si){ i0, i1, i2, i3};
+  return (__m128i){ (long long)__q0, (long long)__q1 };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
 {
-  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
 {
-  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
+{
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64x(long long __q)
 {
   return (__m128i){ __q, __q };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64(__m64 __q)
 {
   return (__m128i){ (long long)__q, (long long)__q };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi32(int __i)
 {
   return (__m128i)(__v4si){ __i, __i, __i, __i };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi16(short __w)
 {
   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi8(char __b)
 {
   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_setr_epi64(__m64 q0, __m64 q1)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi64(__m64 __q0, __m64 __q1)
 {
-  return (__m128i){ (long long)q0, (long long)q1 };
+  return (__m128i){ (long long)__q0, (long long)__q1 };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_setr_epi32(int i0, int i1, int i2, int i3)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
 {
-  return (__m128i)(__v4si){ i0, i1, i2, i3};
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
 {
-  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
 {
-  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setzero_si128(void)
 {
   return (__m128i){ 0LL, 0LL };
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_si128(__m128i *__p, __m128i __b)
 {
   *__p = __b;
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_si128(__m128i *__p, __m128i __b)
 {
   __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
 {
   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_epi64(__m128i *__p, __m128i __a)
 {
   struct __mm_storel_epi64_struct {
@@ -1233,76 +1249,76 @@
   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pd(double *__p, __m128d __a)
 {
   __builtin_ia32_movntpd(__p, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si128(__m128i *__p, __m128i __a)
 {
   __builtin_ia32_movntdq(__p, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si32(int *__p, int __a)
 {
   __builtin_ia32_movnti(__p, __a);
 }
 
 #ifdef __x86_64__
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si64(long long *__p, long long __a)
 {
   __builtin_ia32_movnti64(__p, __a);
 }
 #endif
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_clflush(void const *__p)
 {
   __builtin_ia32_clflush(__p);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_lfence(void)
 {
   __builtin_ia32_lfence();
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mfence(void)
 {
   __builtin_ia32_mfence();
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packus_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_extract_epi16(__m128i __a, int __imm)
 {
   __v8hi __b = (__v8hi)__a;
   return (unsigned short)__b[__imm & 7];
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_insert_epi16(__m128i __a, int __b, int __imm)
 {
   __v8hi __c = (__v8hi)__a;
@@ -1310,7 +1326,7 @@
   return (__m128i)__c;
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_epi8(__m128i __a)
 {
   return __builtin_ia32_pmovmskb128((__v16qi)__a);
@@ -1318,158 +1334,158 @@
 
 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
-                                   (__v4si)_mm_set1_epi32(0), \
+                                   (__v4si)_mm_setzero_si128(), \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
 
 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
-                                   (__v8hi)_mm_set1_epi16(0), \
+                                   (__v8hi)_mm_setzero_si128(), \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                    4, 5, 6, 7); })
 
 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
-                                   (__v8hi)_mm_set1_epi16(0), \
+                                   (__v8hi)_mm_setzero_si128(), \
                                    0, 1, 2, 3, \
                                    4 + (((imm) & 0x03) >> 0), \
                                    4 + (((imm) & 0x0c) >> 2), \
                                    4 + (((imm) & 0x30) >> 4), \
                                    4 + (((imm) & 0xc0) >> 6)); })
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_movepi64_pi64(__m128i __a)
 {
   return (__m64)__a[0];
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_movpi64_epi64(__m64 __a)
 {
   return (__m128i){ (long long)__a, 0 };
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_move_epi64(__m128i __a)
 {
   return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpackhi_pd(__m128d __a, __m128d __b)
 {
   return __builtin_shufflevector(__a, __b, 1, 2+1);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpacklo_pd(__m128d __a, __m128d __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 2+0);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_pd(__m128d __a)
 {
   return __builtin_ia32_movmskpd(__a);
 }
 
 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
-  __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
-                          (i) & 1, (((i) & 2) >> 1) + 2); })
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
+                                   (i) & 1, (((i) & 2) >> 1) + 2); })
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castpd_ps(__m128d __a)
 {
   return (__m128)__a;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castpd_si128(__m128d __a)
 {
   return (__m128i)__a;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castps_pd(__m128 __a)
 {
   return (__m128d)__a;
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castps_si128(__m128 __a)
 {
   return (__m128i)__a;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castsi128_ps(__m128i __a)
 {
   return (__m128)__a;
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castsi128_pd(__m128i __a)
 {
   return (__m128d)__a;
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_pause(void)
 {
-  __asm__ volatile ("pause");
+  __builtin_ia32_pause();
 }
 
-#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+#undef __DEFAULT_FN_ATTRS
 
-#endif /* __SSE2__ */
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
 
 #endif /* __EMMINTRIN_H */
diff --git a/renderscript/clang-include/f16cintrin.h b/renderscript/clang-include/f16cintrin.h
index f3614c0..c655d98 100644
--- a/renderscript/clang-include/f16cintrin.h
+++ b/renderscript/clang-include/f16cintrin.h
@@ -21,38 +21,25 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <f16cintrin.h> directly; include <x86intrin.h> instead."
+#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
 #endif
 
-#ifndef __F16C__
-# error "F16C instruction is not enabled"
-#endif /* __F16C__ */
-
 #ifndef __F16CINTRIN_H
 #define __F16CINTRIN_H
 
-typedef float __v8sf __attribute__ ((__vector_size__ (32)));
-typedef float __m256 __attribute__ ((__vector_size__ (32)));
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
 
 #define _mm_cvtps_ph(a, imm) __extension__ ({ \
-  __m128 __a = (a); \
- (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__a, (imm)); })
+ (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
 
-#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
-  __m256 __a = (a); \
- (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__a, (imm)); })
-
-static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline __m128 __DEFAULT_FN_ATTRS
 _mm_cvtph_ps(__m128i __a)
 {
   return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
 }
 
-static __inline __m256 __attribute__((__always_inline__, __nodebug__))
-_mm256_cvtph_ps(__m128i __a)
-{
-  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
-}
+#undef __DEFAULT_FN_ATTRS
 
 #endif /* __F16CINTRIN_H */
diff --git a/renderscript/clang-include/fma4intrin.h b/renderscript/clang-include/fma4intrin.h
index c30920d..f117887 100644
--- a/renderscript/clang-include/fma4intrin.h
+++ b/renderscript/clang-include/fma4intrin.h
@@ -28,204 +28,203 @@
 #ifndef __FMA4INTRIN_H
 #define __FMA4INTRIN_H
 
-#ifndef __FMA4__
-# error "FMA4 instruction set is not enabled"
-#else
-
 #include <pmmintrin.h>
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
 }
 
-#endif /* __FMA4__ */
+#undef __DEFAULT_FN_ATTRS
 
 #endif /* __FMA4INTRIN_H */
diff --git a/renderscript/clang-include/fmaintrin.h b/renderscript/clang-include/fmaintrin.h
index 6bfd5a8..114a143 100644
--- a/renderscript/clang-include/fmaintrin.h
+++ b/renderscript/clang-include/fmaintrin.h
@@ -28,202 +28,201 @@
 #ifndef __FMAINTRIN_H
 #define __FMAINTRIN_H
 
-#ifndef __FMA__
-# error "FMA instruction set is not enabled"
-#else
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
 }
 
-#endif /* __FMA__ */
+#undef __DEFAULT_FN_ATTRS
 
 #endif /* __FMAINTRIN_H */
diff --git a/renderscript/clang-include/fxsrintrin.h b/renderscript/clang-include/fxsrintrin.h
new file mode 100644
index 0000000..ac6026a
--- /dev/null
+++ b/renderscript/clang-include/fxsrintrin.h
@@ -0,0 +1,55 @@
+/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FXSRINTRIN_H
+#define __FXSRINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave(void *__p) {
+  return __builtin_ia32_fxsave(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave64(void *__p) {
+  return __builtin_ia32_fxsave64(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor(void *__p) {
+  return __builtin_ia32_fxrstor(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor64(void *__p) {
+  return __builtin_ia32_fxrstor64(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/renderscript/clang-include/htmxlintrin.h b/renderscript/clang-include/htmxlintrin.h
index 30f524d..c7571ec 100644
--- a/renderscript/clang-include/htmxlintrin.h
+++ b/renderscript/clang-include/htmxlintrin.h
@@ -46,7 +46,7 @@
 
 typedef char TM_buff_type[16];
 
-/* This macro can be used to determine whether a transaction was successfully 
+/* This macro can be used to determine whether a transaction was successfully
    started from the __TM_begin() and __TM_simple_begin() intrinsic functions
    below.  */
 #define _HTM_TBEGIN_STARTED     1
diff --git a/renderscript/clang-include/immintrin.h b/renderscript/clang-include/immintrin.h
index ac7d54a..f3c6d19 100644
--- a/renderscript/clang-include/immintrin.h
+++ b/renderscript/clang-include/immintrin.h
@@ -24,179 +24,149 @@
 #ifndef __IMMINTRIN_H
 #define __IMMINTRIN_H
 
-#ifdef __MMX__
 #include <mmintrin.h>
-#endif
 
-#ifdef __SSE__
 #include <xmmintrin.h>
-#endif
 
-#ifdef __SSE2__
 #include <emmintrin.h>
-#endif
 
-#ifdef __SSE3__
 #include <pmmintrin.h>
-#endif
 
-#ifdef __SSSE3__
 #include <tmmintrin.h>
-#endif
 
-#if defined (__SSE4_2__) || defined (__SSE4_1__)
 #include <smmintrin.h>
-#endif
 
-#if defined (__AES__) || defined (__PCLMUL__)
 #include <wmmintrin.h>
-#endif
 
-#ifdef __AVX__
 #include <avxintrin.h>
-#endif
 
-#ifdef __AVX2__
 #include <avx2intrin.h>
-#endif
 
-#ifdef __BMI__
+/* The 256-bit versions of functions in f16cintrin.h.
+   Intel documents these as being in immintrin.h, and
+   they depend on typedefs from avxintrin.h. */
+
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+
 #include <bmiintrin.h>
-#endif
 
-#ifdef __BMI2__
 #include <bmi2intrin.h>
-#endif
 
-#ifdef __LZCNT__
 #include <lzcntintrin.h>
-#endif
 
-#ifdef __FMA__
 #include <fmaintrin.h>
-#endif
 
-#ifdef __AVX512F__
 #include <avx512fintrin.h>
-#endif
 
-#ifdef __AVX512VL__
 #include <avx512vlintrin.h>
-#endif
 
-#ifdef __AVX512BW__
 #include <avx512bwintrin.h>
-#endif
 
-#ifdef __AVX512DQ__
+#include <avx512cdintrin.h>
+
 #include <avx512dqintrin.h>
-#endif
 
-#if defined (__AVX512VL__) && defined (__AVX512BW__)
 #include <avx512vlbwintrin.h>
-#endif
 
-#if defined (__AVX512VL__) && defined (__AVX512DQ__)
 #include <avx512vldqintrin.h>
-#endif
 
-#ifdef __AVX512ER__
 #include <avx512erintrin.h>
-#endif
 
-#ifdef __RDRND__
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand16_step(unsigned short *__p)
 {
   return __builtin_ia32_rdrand16_step(__p);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand32_step(unsigned int *__p)
 {
   return __builtin_ia32_rdrand32_step(__p);
 }
 
 #ifdef __x86_64__
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand64_step(unsigned long long *__p)
 {
   return __builtin_ia32_rdrand64_step(__p);
 }
 #endif
-#endif /* __RDRND__ */
 
-#ifdef __FSGSBASE__
 #ifdef __x86_64__
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readfsbase_u32(void)
 {
   return __builtin_ia32_rdfsbase32();
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readfsbase_u64(void)
 {
   return __builtin_ia32_rdfsbase64();
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readgsbase_u32(void)
 {
   return __builtin_ia32_rdgsbase32();
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readgsbase_u64(void)
 {
   return __builtin_ia32_rdgsbase64();
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u32(unsigned int __V)
 {
   return __builtin_ia32_wrfsbase32(__V);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u64(unsigned long long __V)
 {
   return __builtin_ia32_wrfsbase64(__V);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u32(unsigned int __V)
 {
   return __builtin_ia32_wrgsbase32(__V);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u64(unsigned long long __V)
 {
   return __builtin_ia32_wrgsbase64(__V);
 }
 #endif
-#endif /* __FSGSBASE__ */
 
-#ifdef __RTM__
 #include <rtmintrin.h>
-#endif
 
-/* FIXME: check __HLE__ as well when HLE is supported. */
-#if defined (__RTM__)
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
-_xtest(void)
-{
-  return __builtin_ia32_xtest();
-}
-#endif
+#include <xtestintrin.h>
 
-#ifdef __SHA__
 #include <shaintrin.h>
-#endif
 
-/* Some intrinsics inside adxintrin.h are available only if __ADX__ defined,
- * whereas others are also available if __ADX__ undefined */
+#include <fxsrintrin.h>
+
+#include <xsaveintrin.h>
+
+#include <xsaveoptintrin.h>
+
+#include <xsavecintrin.h>
+
+#include <xsavesintrin.h>
+
+/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
+ * whereas others are also available at all times. */
 #include <adxintrin.h>
 
 #endif /* __IMMINTRIN_H */
diff --git a/renderscript/clang-include/inttypes.h b/renderscript/clang-include/inttypes.h
new file mode 100644
index 0000000..3d59d14
--- /dev/null
+++ b/renderscript/clang-include/inttypes.h
@@ -0,0 +1,102 @@
+/*===---- inttypes.h - Standard header for integer printf macros ----------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_INTTYPES_H
+#define __CLANG_INTTYPES_H
+
+#include_next <inttypes.h>
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+/* MSVC headers define int32_t as int, but PRIx32 as "lx" instead of "x".
+ * This triggers format warnings, so fix it up here. */
+#undef PRId32
+#undef PRIdLEAST32
+#undef PRIdFAST32
+#undef PRIi32
+#undef PRIiLEAST32
+#undef PRIiFAST32
+#undef PRIo32
+#undef PRIoLEAST32
+#undef PRIoFAST32
+#undef PRIu32
+#undef PRIuLEAST32
+#undef PRIuFAST32
+#undef PRIx32
+#undef PRIxLEAST32
+#undef PRIxFAST32
+#undef PRIX32
+#undef PRIXLEAST32
+#undef PRIXFAST32
+
+#undef SCNd32
+#undef SCNdLEAST32
+#undef SCNdFAST32
+#undef SCNi32
+#undef SCNiLEAST32
+#undef SCNiFAST32
+#undef SCNo32
+#undef SCNoLEAST32
+#undef SCNoFAST32
+#undef SCNu32
+#undef SCNuLEAST32
+#undef SCNuFAST32
+#undef SCNx32
+#undef SCNxLEAST32
+#undef SCNxFAST32
+
+#define PRId32 "d"
+#define PRIdLEAST32 "d"
+#define PRIdFAST32 "d"
+#define PRIi32 "i"
+#define PRIiLEAST32 "i"
+#define PRIiFAST32 "i"
+#define PRIo32 "o"
+#define PRIoLEAST32 "o"
+#define PRIoFAST32 "o"
+#define PRIu32 "u"
+#define PRIuLEAST32 "u"
+#define PRIuFAST32 "u"
+#define PRIx32 "x"
+#define PRIxLEAST32 "x"
+#define PRIxFAST32 "x"
+#define PRIX32 "X"
+#define PRIXLEAST32 "X"
+#define PRIXFAST32 "X"
+
+#define SCNd32 "d"
+#define SCNdLEAST32 "d"
+#define SCNdFAST32 "d"
+#define SCNi32 "i"
+#define SCNiLEAST32 "i"
+#define SCNiFAST32 "i"
+#define SCNo32 "o"
+#define SCNoLEAST32 "o"
+#define SCNoFAST32 "o"
+#define SCNu32 "u"
+#define SCNuLEAST32 "u"
+#define SCNuFAST32 "u"
+#define SCNx32 "x"
+#define SCNxLEAST32 "x"
+#define SCNxFAST32 "x"
+#endif
+
+#endif /* __CLANG_INTTYPES_H */
diff --git a/renderscript/clang-include/lzcntintrin.h b/renderscript/clang-include/lzcntintrin.h
index 35d6659..4c00e42 100644
--- a/renderscript/clang-include/lzcntintrin.h
+++ b/renderscript/clang-include/lzcntintrin.h
@@ -25,43 +25,44 @@
 #error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
 #endif
 
-#ifndef __LZCNT__
-# error "LZCNT instruction is not enabled"
-#endif /* __LZCNT__ */
-
 #ifndef __LZCNTINTRIN_H
 #define __LZCNTINTRIN_H
 
-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
 __lzcnt16(unsigned short __X)
 {
   return __X ? __builtin_clzs(__X) : 16;
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __lzcnt32(unsigned int __X)
 {
   return __X ? __builtin_clz(__X) : 32;
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _lzcnt_u32(unsigned int __X)
 {
   return __X ? __builtin_clz(__X) : 32;
 }
 
 #ifdef __x86_64__
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __lzcnt64(unsigned long long __X)
 {
   return __X ? __builtin_clzll(__X) : 64;
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _lzcnt_u64(unsigned long long __X)
 {
   return __X ? __builtin_clzll(__X) : 64;
 }
 #endif
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __LZCNTINTRIN_H */
diff --git a/renderscript/clang-include/mm3dnow.h b/renderscript/clang-include/mm3dnow.h
index 5242d99..cb93faf 100644
--- a/renderscript/clang-include/mm3dnow.h
+++ b/renderscript/clang-include/mm3dnow.h
@@ -29,134 +29,143 @@
 
 typedef float __v2sf __attribute__((__vector_size__(8)));
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
 _m_femms() {
   __builtin_ia32_femms();
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pavgusb(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pf2id(__m64 __m) {
   return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfacc(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfadd(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfcmpeq(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfcmpge(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfcmpgt(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfmax(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfmin(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfmul(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfrcp(__m64 __m) {
   return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfrcpit1(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfrcpit2(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfrsqrt(__m64 __m) {
   return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfsub(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfsubr(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pi2fd(__m64 __m) {
   return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pmulhrw(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+/* Handle the 3dnowa instructions here. */
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pf2iw(__m64 __m) {
   return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfnacc(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pfpnacc(__m64 __m1, __m64 __m2) {
   return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pi2fw(__m64 __m) {
   return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pswapdsf(__m64 __m) {
   return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pswapdsi(__m64 __m) {
   return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
 }
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif
diff --git a/renderscript/clang-include/mmintrin.h b/renderscript/clang-include/mmintrin.h
index 986870a..162cb1a 100644
--- a/renderscript/clang-include/mmintrin.h
+++ b/renderscript/clang-include/mmintrin.h
@@ -24,377 +24,376 @@
 #ifndef __MMINTRIN_H
 #define __MMINTRIN_H
 
-#ifndef __MMX__
-#error "MMX instruction set not enabled"
-#else
-
 typedef long long __m64 __attribute__((__vector_size__(8)));
 
 typedef int __v2si __attribute__((__vector_size__(8)));
 typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_empty(void)
 {
     __builtin_ia32_emms();
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtsi32_si64(int __i)
 {
     return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsi64_si32(__m64 __m)
 {
     return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtsi64_m64(long long __i)
 {
     return (__m64)__i;
 }
 
-static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtm64_si64(__m64 __m)
 {
     return (long long)__m;
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_packs_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_packs_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_packs_pu16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
-_mm_adds_pi8(__m64 __m1, __m64 __m2) 
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_adds_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);    
+    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
-_mm_adds_pu8(__m64 __m1, __m64 __m2) 
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
 }
- 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
-_mm_adds_pu16(__m64 __m1, __m64 __m2) 
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
 }
- 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
 }
- 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pu8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
 }
- 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pu16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_madd_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
 }
- 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
-_mm_mullo_pi16(__m64 __m1, __m64 __m2) 
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_pi16(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);    
+    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_pi32(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_si64(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psllq(__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psllqi(__m, __count);    
+    return (__m64)__builtin_ia32_psllqi(__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sra_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);    
+    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srai_pi16(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sra_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);    
+    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srai_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srl_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);    
+    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srli_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);    
+    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srl_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);       
+    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srli_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srl_si64(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrlq(__m, __count);    
+    return (__m64)__builtin_ia32_psrlq(__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrlqi(__m, __count);    
+    return (__m64)__builtin_ia32_psrlqi(__m, __count);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_and_si64(__m64 __m1, __m64 __m2)
 {
     return __builtin_ia32_pand(__m1, __m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_andnot_si64(__m64 __m1, __m64 __m2)
 {
     return __builtin_ia32_pandn(__m1, __m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_or_si64(__m64 __m1, __m64 __m2)
 {
     return __builtin_ia32_por(__m1, __m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_xor_si64(__m64 __m1, __m64 __m2)
 {
     return __builtin_ia32_pxor(__m1, __m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setzero_si64(void)
 {
     return (__m64){ 0LL };
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set_pi32(int __i1, int __i0)
 {
     return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 {
     return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
             char __b1, char __b0)
 {
@@ -402,48 +401,51 @@
                                                __b4, __b5, __b6, __b7);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set1_pi32(int __i)
 {
     return _mm_set_pi32(__i, __i);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set1_pi16(short __w)
 {
     return _mm_set_pi16(__w, __w, __w, __w);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set1_pi8(char __b)
 {
     return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setr_pi32(int __i0, int __i1)
 {
     return _mm_set_pi32(__i1, __i0);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 {
     return _mm_set_pi16(__w3, __w2, __w1, __w0);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7)
 {
     return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 }
 
+#undef __DEFAULT_FN_ATTRS
 
 /* Aliases for compatibility. */
 #define _m_empty _mm_empty
 #define _m_from_int _mm_cvtsi32_si64
+#define _m_from_int64 _mm_cvtsi64_m64
 #define _m_to_int _mm_cvtsi64_si32
+#define _m_to_int64 _mm_cvtm64_si64
 #define _m_packsswb _mm_packs_pi16
 #define _m_packssdw _mm_packs_pi32
 #define _m_packuswb _mm_packs_pu16
@@ -497,7 +499,5 @@
 #define _m_pcmpgtw _mm_cmpgt_pi16
 #define _m_pcmpgtd _mm_cmpgt_pi32
 
-#endif /* __MMX__ */
-
 #endif /* __MMINTRIN_H */
 
diff --git a/renderscript/clang-include/nmmintrin.h b/renderscript/clang-include/nmmintrin.h
index f12622d..57fec15 100644
--- a/renderscript/clang-include/nmmintrin.h
+++ b/renderscript/clang-include/nmmintrin.h
@@ -24,12 +24,7 @@
 #ifndef _NMMINTRIN_H
 #define _NMMINTRIN_H
 
-#ifndef __SSE4_2__
-#error "SSE4.2 instruction set not enabled"
-#else
-
 /* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
    just include it now then.  */
 #include <smmintrin.h>
-#endif /* __SSE4_2__ */
 #endif /* _NMMINTRIN_H */
diff --git a/renderscript/clang-include/pmmintrin.h b/renderscript/clang-include/pmmintrin.h
index 6f1fc32..0ff9409 100644
--- a/renderscript/clang-include/pmmintrin.h
+++ b/renderscript/clang-include/pmmintrin.h
@@ -20,65 +20,64 @@
  *
  *===-----------------------------------------------------------------------===
  */
- 
+
 #ifndef __PMMINTRIN_H
 #define __PMMINTRIN_H
 
-#ifndef __SSE3__
-#error "SSE3 instruction set not enabled"
-#else
-
 #include <emmintrin.h>
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_lddqu_si128(__m128i const *__p)
 {
   return (__m128i)__builtin_ia32_lddqu((char const *)__p);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_addsub_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_addsubps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_hadd_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_haddps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_hsub_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_hsubps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_movehdup_ps(__m128 __a)
 {
   return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_moveldup_ps(__m128 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_addsub_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_addsubpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_hadd_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_haddpd(__a, __b);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_hsub_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_hsubpd(__a, __b);
@@ -86,7 +85,7 @@
 
 #define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_movedup_pd(__m128d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 0);
@@ -100,18 +99,18 @@
 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
 {
   __builtin_ia32_monitor((void *)__p, __extensions, __hints);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mwait(unsigned __extensions, unsigned __hints)
 {
   __builtin_ia32_mwait(__extensions, __hints);
 }
 
-#endif /* __SSE3__ */
+#undef __DEFAULT_FN_ATTRS
 
 #endif /* __PMMINTRIN_H */
diff --git a/renderscript/clang-include/popcntintrin.h b/renderscript/clang-include/popcntintrin.h
index d439daa..6fcda65 100644
--- a/renderscript/clang-include/popcntintrin.h
+++ b/renderscript/clang-include/popcntintrin.h
@@ -21,25 +21,38 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef __POPCNT__
-#error "POPCNT instruction set not enabled"
-#endif
-
 #ifndef _POPCNTINTRIN_H
 #define _POPCNTINTRIN_H
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_popcnt_u32(unsigned int __A)
 {
   return __builtin_popcount(__A);
 }
 
+static __inline__ int __DEFAULT_FN_ATTRS
+_popcnt32(int __A)
+{
+  return __builtin_popcount(__A);
+}
+
 #ifdef __x86_64__
-static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_popcnt_u64(unsigned long long __A)
 {
   return __builtin_popcountll(__A);
 }
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_popcnt64(long long __A)
+{
+  return __builtin_popcountll(__A);
+}
 #endif /* __x86_64__ */
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* _POPCNTINTRIN_H */
diff --git a/renderscript/clang-include/prfchwintrin.h b/renderscript/clang-include/prfchwintrin.h
index 9825bd8..ba02857 100644
--- a/renderscript/clang-include/prfchwintrin.h
+++ b/renderscript/clang-include/prfchwintrin.h
@@ -30,6 +30,12 @@
 
 #if defined(__PRFCHW__) || defined(__3dNOW__)
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetch(void *__P)
+{
+  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _m_prefetchw(void *__P)
 {
   __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
diff --git a/renderscript/clang-include/rdseedintrin.h b/renderscript/clang-include/rdseedintrin.h
index 0fef1fa..421f4ea 100644
--- a/renderscript/clang-include/rdseedintrin.h
+++ b/renderscript/clang-include/rdseedintrin.h
@@ -28,25 +28,29 @@
 #ifndef __RDSEEDINTRIN_H
 #define __RDSEEDINTRIN_H
 
-#ifdef __RDSEED__
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
 _rdseed16_step(unsigned short *__p)
 {
   return __builtin_ia32_rdseed16_step(__p);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _rdseed32_step(unsigned int *__p)
 {
   return __builtin_ia32_rdseed32_step(__p);
 }
 
 #ifdef __x86_64__
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _rdseed64_step(unsigned long long *__p)
 {
   return __builtin_ia32_rdseed64_step(__p);
 }
 #endif
-#endif /* __RDSEED__ */
+
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __RDSEEDINTRIN_H */
diff --git a/renderscript/clang-include/rtmintrin.h b/renderscript/clang-include/rtmintrin.h
index 26149ca..e6a58d7 100644
--- a/renderscript/clang-include/rtmintrin.h
+++ b/renderscript/clang-include/rtmintrin.h
@@ -37,13 +37,16 @@
 #define _XABORT_NESTED    (1 << 5)
 #define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _xbegin(void)
 {
   return __builtin_ia32_xbegin();
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _xend(void)
 {
   __builtin_ia32_xend();
@@ -51,4 +54,6 @@
 
 #define _xabort(imm) __builtin_ia32_xabort((imm))
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __RTMINTRIN_H */
diff --git a/renderscript/clang-include/s390intrin.h b/renderscript/clang-include/s390intrin.h
index b209895..d51274c 100644
--- a/renderscript/clang-include/s390intrin.h
+++ b/renderscript/clang-include/s390intrin.h
@@ -32,4 +32,8 @@
 #include <htmintrin.h>
 #endif
 
+#ifdef __VEC__
+#include <vecintrin.h>
+#endif
+
 #endif /* __S390INTRIN_H*/
diff --git a/renderscript/clang-include/shaintrin.h b/renderscript/clang-include/shaintrin.h
index 391a4bb..9b5d218 100644
--- a/renderscript/clang-include/shaintrin.h
+++ b/renderscript/clang-include/shaintrin.h
@@ -28,47 +28,48 @@
 #ifndef __SHAINTRIN_H
 #define __SHAINTRIN_H
 
-#if !defined (__SHA__)
-#  error "SHA instructions not enabled"
-#endif
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
 
 #define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
-  __builtin_ia32_sha1rnds4((V1), (V2), (M)); })
+  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
 {
   return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
 {
   return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
 }
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __SHAINTRIN_H */
diff --git a/renderscript/clang-include/smmintrin.h b/renderscript/clang-include/smmintrin.h
index 6e35734..69ad07f 100644
--- a/renderscript/clang-include/smmintrin.h
+++ b/renderscript/clang-include/smmintrin.h
@@ -24,12 +24,11 @@
 #ifndef _SMMINTRIN_H
 #define _SMMINTRIN_H
 
-#ifndef __SSE4_1__
-#error "SSE4.1 instruction set not enabled"
-#else
-
 #include <tmmintrin.h>
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
+
 /* SSE4 Rounding macros. */
 #define _MM_FROUND_TO_NEAREST_INT    0x00
 #define _MM_FROUND_TO_NEG_INF        0x01
@@ -58,55 +57,48 @@
 #define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
 
 #define _mm_round_ps(X, M) __extension__ ({ \
-  __m128 __X = (X); \
-  (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })
+  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
 
 #define _mm_round_ss(X, Y, M) __extension__ ({ \
-  __m128 __X = (X); \
-  __m128 __Y = (Y); \
-  (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })
+  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
+                                 (__v4sf)(__m128)(Y), (M)); })
 
 #define _mm_round_pd(X, M) __extension__ ({ \
-  __m128d __X = (X); \
-  (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })
+  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
 
 #define _mm_round_sd(X, Y, M) __extension__ ({ \
-  __m128d __X = (X); \
-  __m128d __Y = (Y); \
-  (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })
+  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
+                                  (__v2df)(__m128d)(Y), (M)); })
 
 /* SSE4 Packed Blending Intrinsics.  */
 #define _mm_blend_pd(V1, V2, M) __extension__ ({ \
-  __m128d __V1 = (V1); \
-  __m128d __V2 = (V2); \
-  (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
+                                   (__v2df)(__m128d)(V2), \
                                    (((M) & 0x01) ? 2 : 0), \
                                    (((M) & 0x02) ? 3 : 1)); })
 
 #define _mm_blend_ps(V1, V2, M) __extension__ ({ \
-  __m128 __V1 = (V1); \
-  __m128 __V2 = (V2); \
-  (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
 {
   return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
                                             (__v2df)__M);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
 {
   return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
                                            (__v4sf)__M);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
 {
   return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
@@ -114,9 +106,8 @@
 }
 
 #define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
-  __m128i __V1 = (V1); \
-  __m128i __V2 = (V2); \
-  (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
+                                   (__v8hi)(__m128i)(V2), \
                                    (((M) & 0x01) ?  8 : 0), \
                                    (((M) & 0x02) ?  9 : 1), \
                                    (((M) & 0x04) ? 10 : 2), \
@@ -127,13 +118,13 @@
                                    (((M) & 0x80) ? 15 : 7)); })
 
 /* SSE4 Dword Multiply Instructions.  */
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
 }
 
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
@@ -141,66 +132,64 @@
 
 /* SSE4 Floating Point Dot Product Instructions.  */
 #define _mm_dp_ps(X, Y, M) __extension__ ({ \
-  __m128 __X = (X); \
-  __m128 __Y = (Y); \
-  (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })
+  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
+                               (__v4sf)(__m128)(Y), (M)); })
 
 #define _mm_dp_pd(X, Y, M) __extension__ ({\
-  __m128d __X = (X); \
-  __m128d __Y = (Y); \
-  (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })
+  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
+                                (__v2df)(__m128d)(Y), (M)); })
 
 /* SSE4 Streaming Load Hint Instruction.  */
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
-_mm_stream_load_si128 (__m128i *__V)
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_stream_load_si128 (__m128i const *__V)
 {
-  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
+  return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
 }
 
 /* SSE4 Packed Integer Min/Max Instructions.  */
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi8 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi8 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu16 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu16 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
 }
 
-static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
@@ -210,7 +199,7 @@
 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
 #define _mm_extract_ps(X, N) (__extension__                      \
                               ({ union { int __i; float __f; } __t;  \
-                                 __v4sf __a = (__v4sf)(X);       \
+                                 __v4sf __a = (__v4sf)(__m128)(X);       \
                                  __t.__f = __a[(N) & 3];                 \
                                  __t.__i;}))
 
@@ -218,55 +207,60 @@
 /* Extract a single-precision float from X at index N into D.  */
 #define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
                                                     (D) = __a[N]; }))
-                                                    
+
 /* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
    an index suitable for _mm_insert_ps.  */
 #define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
-                                           
+
 /* Extract a float from X at index N into the first index of the return.  */
 #define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
                                              _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
-                                             
+
 /* Insert int into packed integer array at index.  */
-#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
-                                                   __a[(N) & 15] = (I);             \
-                                                   __a;}))
-#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
-                                                    __a[(N) & 3] = (I);           \
-                                                    __a;}))
+#define _mm_insert_epi8(X, I, N) (__extension__                           \
+                                  ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                     __a[(N) & 15] = (I);                 \
+                                     __a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__                         \
+                                   ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                      __a[(N) & 3] = (I);                \
+                                      __a;}))
 #ifdef __x86_64__
-#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
-                                                    __a[(N) & 1] = (I);           \
-                                                    __a;}))
+#define _mm_insert_epi64(X, I, N) (__extension__                         \
+                                   ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                      __a[(N) & 1] = (I);                \
+                                      __a;}))
 #endif /* __x86_64__ */
 
 /* Extract int from packed integer array at index.  This returns the element
  * as a zero extended value, so it is unsigned.
  */
-#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
-                                                 (int)(unsigned char) \
-                                                     __a[(N) & 15];}))
-#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
-                                                  __a[(N) & 3];}))
+#define _mm_extract_epi8(X, N) (__extension__                           \
+                                ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                   (int)(unsigned char) __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__                         \
+                                 ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                    (int)__a[(N) & 3];}))
 #ifdef __x86_64__
-#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
-                                                  __a[(N) & 1];}))
+#define _mm_extract_epi64(X, N) (__extension__                         \
+                                 ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                    (long long)__a[(N) & 1];}))
 #endif /* __x86_64 */
 
 /* SSE4 128-bit Packed Integer Comparisons.  */
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testz_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testc_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testnzc_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
@@ -277,88 +271,95 @@
 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
 
 /* SSE4 64-bit Packed Integer Comparisons.  */
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
 {
   return (__m128i)((__v2di)__V1 == (__v2di)__V2);
 }
 
 /* SSE4 Packed Integer Sign-Extension.  */
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi16(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi32(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi64(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  typedef signed char __v16qs __attribute__((__vector_size__(16)));
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi32(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V); 
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi64(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi32_epi64(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
 }
 
 /* SSE4 Packed Integer Zero-Extension.  */
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi16(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi32(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi64(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi32(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi64(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu32_epi64(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
 }
 
 /* SSE4 Pack with Unsigned Saturation.  */
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packus_epi32(__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
@@ -366,19 +367,22 @@
 
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
 #define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
-  __m128i __X = (X); \
-  __m128i __Y = (Y); \
-  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })
+  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
+                                      (__v16qi)(__m128i)(Y), (M)); })
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_minpos_epu16(__m128i __V)
 {
   return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
 }
 
+/* Handle the sse4.2 definitions here. */
+
 /* These definitions are normally in nmmintrin.h, but gcc puts them in here
    so we'll do the same.  */
-#ifdef __SSE4_2__
+
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
 
 /* These specify the type of data that we're comparing.  */
 #define _SIDD_UBYTE_OPS                 0x00
@@ -407,76 +411,98 @@
 #define _SIDD_UNIT_MASK                 0x40
 
 /* SSE4.2 Packed Comparison Intrinsics.  */
-#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
-#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))
+#define _mm_cmpistrm(A, B, M) \
+  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+                                       (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistri(A, B, M) \
+  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+                                   (__v16qi)(__m128i)(B), (int)(M))
 
 #define _mm_cmpestrm(A, LA, B, LB, M) \
-     __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
+  (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+                                       (__v16qi)(__m128i)(B), (int)(LB), \
+                                       (int)(M))
 #define _mm_cmpestri(A, LA, B, LB, M) \
-     __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))
-     
+  (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+                                   (__v16qi)(__m128i)(B), (int)(LB), \
+                                   (int)(M))
+
 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
 #define _mm_cmpistra(A, B, M) \
-     __builtin_ia32_pcmpistria128((A), (B), (M))
+  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
 #define _mm_cmpistrc(A, B, M) \
-     __builtin_ia32_pcmpistric128((A), (B), (M))
+  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
 #define _mm_cmpistro(A, B, M) \
-     __builtin_ia32_pcmpistrio128((A), (B), (M))
+  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
 #define _mm_cmpistrs(A, B, M) \
-     __builtin_ia32_pcmpistris128((A), (B), (M))
+  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
 #define _mm_cmpistrz(A, B, M) \
-     __builtin_ia32_pcmpistriz128((A), (B), (M))
+  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
 
 #define _mm_cmpestra(A, LA, B, LB, M) \
-     __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
+  (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
 #define _mm_cmpestrc(A, LA, B, LB, M) \
-     __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
+  (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
 #define _mm_cmpestro(A, LA, B, LB, M) \
-     __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
+  (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
 #define _mm_cmpestrs(A, LA, B, LB, M) \
-     __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
+  (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
 #define _mm_cmpestrz(A, LA, B, LB, M) \
-     __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
+  (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
 
 /* SSE4.2 Compare Packed Data -- Greater Than.  */
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
 {
   return (__m128i)((__v2di)__V1 > (__v2di)__V2);
 }
 
 /* SSE4.2 Accumulate CRC32.  */
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u8(unsigned int __C, unsigned char __D)
 {
   return __builtin_ia32_crc32qi(__C, __D);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u16(unsigned int __C, unsigned short __D)
 {
   return __builtin_ia32_crc32hi(__C, __D);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u32(unsigned int __C, unsigned int __D)
 {
   return __builtin_ia32_crc32si(__C, __D);
 }
 
 #ifdef __x86_64__
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
 {
   return __builtin_ia32_crc32di(__C, __D);
 }
 #endif /* __x86_64__ */
 
+#undef __DEFAULT_FN_ATTRS
+
 #ifdef __POPCNT__
 #include <popcntintrin.h>
 #endif
 
-#endif /* __SSE4_2__ */
-#endif /* __SSE4_1__ */
-
 #endif /* _SMMINTRIN_H */
diff --git a/renderscript/clang-include/stdint.h b/renderscript/clang-include/stdint.h
index 0303db9..3f2fcbc 100644
--- a/renderscript/clang-include/stdint.h
+++ b/renderscript/clang-include/stdint.h
@@ -77,14 +77,14 @@
  * C99 7.18.1.2 Minimum-width integer types.
  * C99 7.18.1.3 Fastest minimum-width integer types.
  *
- * The standard requires that exact-width type be defined for 8-, 16-, 32-, and 
+ * The standard requires that exact-width type be defined for 8-, 16-, 32-, and
  * 64-bit types if they are implemented. Other exact width types are optional.
  * This implementation defines an exact-width types for every integer width
  * that is represented in the standard integer types.
  *
  * The standard also requires minimum-width types be defined for 8-, 16-, 32-,
  * and 64-bit widths regardless of whether there are corresponding exact-width
- * types. 
+ * types.
  *
  * To accommodate targets that are missing types that are exactly 8, 16, 32, or
  * 64 bits wide, this implementation takes an approach of cascading
@@ -97,7 +97,7 @@
  * suboptimal.
  *
  * In violation of the standard, some targets do not implement a type that is
- * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).  
+ * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
  * To accommodate these targets, a required minimum-width type is only
  * defined if there exists an exact-width type of equal or greater width.
  */
@@ -247,7 +247,7 @@
 #endif /* __int_least8_t */
 
 /* prevent glibc sys/types.h from defining conflicting types */
-#ifndef __int8_t_defined  
+#ifndef __int8_t_defined
 # define __int8_t_defined
 #endif /* __int8_t_defined */
 
@@ -280,9 +280,9 @@
  *
  * The standard requires that integer constant macros be defined for all the
  * minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
- * types are required, the corresponding integer constant macros are defined 
+ * types are required, the corresponding integer constant macros are defined
  * here. This implementation also defines minimum-width types for every other
- * integer width that the target implements, so corresponding macros are 
+ * integer width that the target implements, so corresponding macros are
  * defined below, too.
  *
  * These macros are defined using the same successive-shrinking approach as
@@ -452,7 +452,7 @@
 #endif /* __int_least8_t */
 
 
-/* C99 7.18.2.1 Limits of exact-width integer types. 
+/* C99 7.18.2.1 Limits of exact-width integer types.
  * C99 7.18.2.2 Limits of minimum-width integer types.
  * C99 7.18.2.3 Limits of fastest minimum-width integer types.
  *
diff --git a/renderscript/clang-include/tbmintrin.h b/renderscript/clang-include/tbmintrin.h
index f95e34f..785961c 100644
--- a/renderscript/clang-include/tbmintrin.h
+++ b/renderscript/clang-include/tbmintrin.h
@@ -21,10 +21,6 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef __TBM__
-#error "TBM instruction set is not enabled"
-#endif
-
 #ifndef __X86INTRIN_H
 #error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
 #endif
@@ -32,127 +28,127 @@
 #ifndef __TBMINTRIN_H
 #define __TBMINTRIN_H
 
-#define __bextri_u32(a, b) (__builtin_ia32_bextri_u32((a), (b)))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+#define __bextri_u32(a, b) \
+  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
+                                           (unsigned int)(b)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blcfill_u32(unsigned int a)
 {
   return a & (a + 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blci_u32(unsigned int a)
 {
   return a | ~(a + 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blcic_u32(unsigned int a)
 {
   return ~a & (a + 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blcmsk_u32(unsigned int a)
 {
   return a ^ (a + 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blcs_u32(unsigned int a)
 {
   return a | (a + 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsfill_u32(unsigned int a)
 {
   return a | (a - 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsic_u32(unsigned int a)
 {
   return ~a | (a - 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __t1mskc_u32(unsigned int a)
 {
   return ~a | (a + 1);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __tzmsk_u32(unsigned int a)
 {
   return ~a & (a - 1);
 }
 
 #ifdef __x86_64__
-#define __bextri_u64(a, b) (__builtin_ia32_bextri_u64((a), (int)(b)))
+#define __bextri_u64(a, b) \
+  ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
+                                                 (unsigned long long)(b)))
 
-static __inline__ unsigned long long __attribute__((__always_inline__,
-                                                    __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blcfill_u64(unsigned long long a)
 {
   return a & (a + 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__,
-                                                    __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blci_u64(unsigned long long a)
 {
   return a | ~(a + 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__,
-                                                    __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blcic_u64(unsigned long long a)
 {
   return ~a & (a + 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__,
-                                                    __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blcmsk_u64(unsigned long long a)
 {
   return a ^ (a + 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__,
-                                                    __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blcs_u64(unsigned long long a)
 {
   return a | (a + 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__,
-                                                    __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsfill_u64(unsigned long long a)
 {
   return a | (a - 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__,
-                                                    __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsic_u64(unsigned long long a)
 {
   return ~a | (a - 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__,
-                                                    __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __t1mskc_u64(unsigned long long a)
 {
   return ~a | (a + 1);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__,
-                                                    __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __tzmsk_u64(unsigned long long a)
 {
   return ~a & (a - 1);
 }
 #endif
 
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __TBMINTRIN_H */
diff --git a/renderscript/clang-include/tgmath.h b/renderscript/clang-include/tgmath.h
index a48e267..318e118 100644
--- a/renderscript/clang-include/tgmath.h
+++ b/renderscript/clang-include/tgmath.h
@@ -490,7 +490,7 @@
 
 static long double _Complex
     _TG_ATTRS
-    __tg_pow(long double _Complex __x, long double _Complex __y) 
+    __tg_pow(long double _Complex __x, long double _Complex __y)
     {return cpowl(__x, __y);}
 
 #undef pow
diff --git a/renderscript/clang-include/tmmintrin.h b/renderscript/clang-include/tmmintrin.h
index 4238f5b..0002890 100644
--- a/renderscript/clang-include/tmmintrin.h
+++ b/renderscript/clang-include/tmmintrin.h
@@ -20,206 +20,202 @@
  *
  *===-----------------------------------------------------------------------===
  */
- 
+
 #ifndef __TMMINTRIN_H
 #define __TMMINTRIN_H
 
-#ifndef __SSSE3__
-#error "SSSE3 instruction set not enabled"
-#else
-
 #include <pmmintrin.h>
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi8(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_abs_epi8(__m128i __a)
 {
     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi16(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_abs_epi16(__m128i __a)
 {
     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi32(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_abs_epi32(__m128i __a)
 {
     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 }
 
 #define _mm_alignr_epi8(a, b, n) __extension__ ({ \
-  __m128i __a = (a); \
-  __m128i __b = (b); \
-  (__m128i)__builtin_ia32_palignr128((__v16qi)__a, (__v16qi)__b, (n)); })
+  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
+                                     (__v16qi)(__m128i)(b), (n)); })
 
 #define _mm_alignr_pi8(a, b, n) __extension__ ({ \
-  __m64 __a = (a); \
-  __m64 __b = (b); \
-  (__m64)__builtin_ia32_palignr((__v8qi)__a, (__v8qi)__b, (n)); })
+  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hadd_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hadd_epi32(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hadds_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsub_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsub_epi32(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsubs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maddubs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_shuffle_epi8(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sign_epi8(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sign_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sign_epi32(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi8(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
 }
 
-#endif /* __SSSE3__ */
+#undef __DEFAULT_FN_ATTRS
 
 #endif /* __TMMINTRIN_H */
diff --git a/renderscript/clang-include/vecintrin.h b/renderscript/clang-include/vecintrin.h
new file mode 100644
index 0000000..ca7acb4
--- /dev/null
+++ b/renderscript/clang-include/vecintrin.h
@@ -0,0 +1,8946 @@
+/*===---- vecintrin.h - Vector intrinsics ----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if defined(__s390x__) && defined(__VEC__)
+
+#define __ATTRS_ai __attribute__((__always_inline__))
+#define __ATTRS_o __attribute__((__overloadable__))
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+#define __constant(PARM) \
+  __attribute__((__enable_if__ ((PARM) == (PARM), \
+     "argument must be a constant integer")))
+#define __constant_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH), \
+     "argument must be a constant integer from " #LOW " to " #HIGH)))
+#define __constant_pow2_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH) && \
+                                ((PARM) & ((PARM) - 1)) == 0, \
+     "argument must be a constant power of 2 from " #LOW " to " #HIGH)))
+
+/*-- __lcbb -----------------------------------------------------------------*/
+
+extern __ATTRS_o unsigned int
+__lcbb(const void *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define __lcbb(X, Y) ((__typeof__((__lcbb)((X), (Y)))) \
+  __builtin_s390_lcbb((X), __builtin_constant_p((Y))? \
+                           ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : 0) : 0))
+
+/*-- vec_extract ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai signed char
+vec_extract(vector signed char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector bool char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector unsigned char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai signed short
+vec_extract(vector signed short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector bool short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector unsigned short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai signed int
+vec_extract(vector signed int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector bool int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector unsigned int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai signed long long
+vec_extract(vector signed long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector bool long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector unsigned long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai double
+vec_extract(vector double __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+/*-- vec_insert -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert(signed char __scalar, vector signed char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector bool char __vec, int __index) {
+  vector unsigned char __newvec = (vector unsigned char)__vec;
+  __newvec[__index & 15] = (unsigned char)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector unsigned char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert(signed short __scalar, vector signed short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector bool short __vec, int __index) {
+  vector unsigned short __newvec = (vector unsigned short)__vec;
+  __newvec[__index & 7] = (unsigned short)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector unsigned short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert(signed int __scalar, vector signed int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector bool int __vec, int __index) {
+  vector unsigned int __newvec = (vector unsigned int)__vec;
+  __newvec[__index & 3] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector unsigned int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert(signed long long __scalar, vector signed long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector bool long long __vec,
+           int __index) {
+  vector unsigned long long __newvec = (vector unsigned long long)__vec;
+  __newvec[__index & 1] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector unsigned long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert(double __scalar, vector double __vec, int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_promote ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_promote(signed char __scalar, int __index) {
+  const vector signed char __zero = (vector signed char)0;
+  vector signed char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_promote(unsigned char __scalar, int __index) {
+  const vector unsigned char __zero = (vector unsigned char)0;
+  vector unsigned char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_promote(signed short __scalar, int __index) {
+  const vector signed short __zero = (vector signed short)0;
+  vector signed short __vec = __builtin_shufflevector(__zero, __zero,
+                                -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_promote(unsigned short __scalar, int __index) {
+  const vector unsigned short __zero = (vector unsigned short)0;
+  vector unsigned short __vec = __builtin_shufflevector(__zero, __zero,
+                                  -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_promote(signed int __scalar, int __index) {
+  const vector signed int __zero = (vector signed int)0;
+  vector signed int __vec = __builtin_shufflevector(__zero, __zero,
+                                                    -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_promote(unsigned int __scalar, int __index) {
+  const vector unsigned int __zero = (vector unsigned int)0;
+  vector unsigned int __vec = __builtin_shufflevector(__zero, __zero,
+                                                      -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_promote(signed long long __scalar, int __index) {
+  const vector signed long long __zero = (vector signed long long)0;
+  vector signed long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                          -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_promote(unsigned long long __scalar, int __index) {
+  const vector unsigned long long __zero = (vector unsigned long long)0;
+  vector unsigned long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                            -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_promote(double __scalar, int __index) {
+  const vector double __zero = (vector double)0;
+  vector double __vec = __builtin_shufflevector(__zero, __zero, -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_insert_and_zero ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert_and_zero(const signed char *__ptr) {
+  vector signed char __vec = (vector signed char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert_and_zero(const unsigned char *__ptr) {
+  vector unsigned char __vec = (vector unsigned char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert_and_zero(const signed short *__ptr) {
+  vector signed short __vec = (vector signed short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert_and_zero(const unsigned short *__ptr) {
+  vector unsigned short __vec = (vector unsigned short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert_and_zero(const signed int *__ptr) {
+  vector signed int __vec = (vector signed int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert_and_zero(const unsigned int *__ptr) {
+  vector unsigned int __vec = (vector unsigned int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert_and_zero(const signed long long *__ptr) {
+  vector signed long long __vec = (vector signed long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert_and_zero(const unsigned long long *__ptr) {
+  vector unsigned long long __vec = (vector unsigned long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert_and_zero(const double *__ptr) {
+  vector double __vec = (vector double)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+/*-- vec_perm ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_perm(vector signed char __a, vector signed char __b,
+         vector unsigned char __c) {
+  return (vector signed char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
+  return (vector unsigned char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_perm(vector bool char __a, vector bool char __b,
+         vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_perm(vector signed short __a, vector signed short __b,
+         vector unsigned char __c) {
+  return (vector signed short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c) {
+  return (vector unsigned short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_perm(vector bool short __a, vector bool short __b,
+         vector unsigned char __c) {
+  return (vector bool short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_perm(vector signed int __a, vector signed int __b,
+         vector unsigned char __c) {
+  return (vector signed int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_perm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned char __c) {
+  return (vector unsigned int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_perm(vector bool int __a, vector bool int __b,
+         vector unsigned char __c) {
+  return (vector bool int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c) {
+  return (vector signed long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c) {
+  return (vector unsigned long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+  return (vector bool long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_perm(vector double __a, vector double __b,
+         vector unsigned char __c) {
+  return (vector double)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+/*-- vec_permi --------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed long long
+vec_permi(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_permi(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector bool long long
+vec_permi(vector bool long long __a, vector bool long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_permi(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
+#define vec_permi(X, Y, Z) ((__typeof__((vec_permi)((X), (Y), (Z)))) \
+  __builtin_s390_vpdi((vector unsigned long long)(X), \
+                      (vector unsigned long long)(Y), \
+                      (((Z) & 2) << 1) | ((Z) & 1)))
+
+/*-- vec_sel ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b,
+        vector unsigned char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return ((vector bool char)__c & __b) | (~(vector bool char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector bool char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector unsigned char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector bool char __c) {
+  return ((vector unsigned char)__c & __b) | (~(vector unsigned char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector unsigned short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector bool short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b,
+        vector unsigned short __c) {
+  return ((vector bool short)__c & __b) | (~(vector bool short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector unsigned short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector bool short __c) {
+  return (((vector unsigned short)__c & __b) |
+          (~(vector unsigned short)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b,
+        vector unsigned int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b, vector bool int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
+  return ((vector bool int)__c & __b) | (~(vector bool int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector bool int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b,
+        vector unsigned int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
+  return ((vector unsigned int)__c & __b) | (~(vector unsigned int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector unsigned long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector bool long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector unsigned long long __c) {
+  return (((vector bool long long)__c & __b) |
+          (~(vector bool long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector bool long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector unsigned long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector bool long long __c) {
+  return (((vector unsigned long long)__c & __b) |
+          (~(vector unsigned long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
+  return (vector double)((__c & (vector unsigned long long)__b) |
+                         (~__c & (vector unsigned long long)__a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector bool long long __c) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  vector unsigned long long __cc = (vector unsigned long long)__c;
+  return (vector double)((__cc & __bc) | (~__cc & __ac));
+}
+
+/*-- vec_gather_element -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed int
+vec_gather_element(vector signed int __vec, vector unsigned int __offset,
+                   const signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const signed int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_gather_element(vector bool int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gather_element(vector unsigned int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_gather_element(vector signed long long __vec,
+                   vector unsigned long long __offset,
+                   const signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const signed long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_gather_element(vector bool long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gather_element(vector unsigned long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_gather_element(vector double __vec, vector unsigned long long __offset,
+                   const double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const double *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+/*-- vec_scatter_element ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed int __vec, vector unsigned int __offset,
+                    signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(signed int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed long long __vec,
+                    vector unsigned long long __offset,
+                    signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(signed long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector double __vec, vector unsigned long long __offset,
+                    double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(double *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+/*-- vec_xld2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xld2(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xld2(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xld2(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xld2(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xld2(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xld2(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_xld2(long __offset, const signed long long *__ptr) {
+  return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xld2(long __offset, const unsigned long long *__ptr) {
+  return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_xld2(long __offset, const double *__ptr) {
+  return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xlw4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xlw4(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xlw4(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xlw4(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xlw4(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xlw4(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xlw4(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xstd2 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed long long __vec, long __offset,
+          signed long long *__ptr) {
+  *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned long long __vec, long __offset,
+          unsigned long long *__ptr) {
+  *(vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset) =
+    __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector double __vec, long __offset, double *__ptr) {
+  *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_xstw4 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_load_bndry ---------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_load_bndry(const signed char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned char
+vec_load_bndry(const unsigned char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed short
+vec_load_bndry(const signed short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned short
+vec_load_bndry(const unsigned short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed int
+vec_load_bndry(const signed int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned int
+vec_load_bndry(const unsigned int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed long long
+vec_load_bndry(const signed long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned long long
+vec_load_bndry(const unsigned long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector double
+vec_load_bndry(const double *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define vec_load_bndry(X, Y) ((__typeof__((vec_load_bndry)((X), (Y)))) \
+  __builtin_s390_vlbb((X), ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : -1)))
+
+/*-- vec_load_len -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_load_len(const signed char *__ptr, unsigned int __len) {
+  return (vector signed char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_load_len(const unsigned char *__ptr, unsigned int __len) {
+  return (vector unsigned char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_load_len(const signed short *__ptr, unsigned int __len) {
+  return (vector signed short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_load_len(const unsigned short *__ptr, unsigned int __len) {
+  return (vector unsigned short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_load_len(const signed int *__ptr, unsigned int __len) {
+  return (vector signed int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_load_len(const unsigned int *__ptr, unsigned int __len) {
+  return (vector unsigned int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_len(const signed long long *__ptr, unsigned int __len) {
+  return (vector signed long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_len(const unsigned long long *__ptr, unsigned int __len) {
+  return (vector unsigned long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_load_len(const double *__ptr, unsigned int __len) {
+  return (vector double)__builtin_s390_vll(__len, __ptr);
+}
+
+/*-- vec_store_len ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed char __vec, signed char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned char __vec, unsigned char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed short __vec, signed short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned short __vec, unsigned short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed int __vec, signed int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned int __vec, unsigned int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed long long __vec, signed long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned long long __vec, unsigned long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector double __vec, double *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+/*-- vec_load_pair ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_pair(signed long long __a, signed long long __b) {
+  return (vector signed long long)(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_pair(unsigned long long __a, unsigned long long __b) {
+  return (vector unsigned long long)(__a, __b);
+}
+
+/*-- vec_genmask ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmask(unsigned short __mask)
+  __constant(__mask) {
+  return (vector unsigned char)(
+    __mask & 0x8000 ? 0xff : 0,
+    __mask & 0x4000 ? 0xff : 0,
+    __mask & 0x2000 ? 0xff : 0,
+    __mask & 0x1000 ? 0xff : 0,
+    __mask & 0x0800 ? 0xff : 0,
+    __mask & 0x0400 ? 0xff : 0,
+    __mask & 0x0200 ? 0xff : 0,
+    __mask & 0x0100 ? 0xff : 0,
+    __mask & 0x0080 ? 0xff : 0,
+    __mask & 0x0040 ? 0xff : 0,
+    __mask & 0x0020 ? 0xff : 0,
+    __mask & 0x0010 ? 0xff : 0,
+    __mask & 0x0008 ? 0xff : 0,
+    __mask & 0x0004 ? 0xff : 0,
+    __mask & 0x0002 ? 0xff : 0,
+    __mask & 0x0001 ? 0xff : 0);
+}
+
+/*-- vec_genmasks_* ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmasks_8(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 7;
+  unsigned char __bit2 = __last & 7;
+  unsigned char __mask1 = (unsigned char)(1U << (7 - __bit1) << 1) - 1;
+  unsigned char __mask2 = (unsigned char)(1U << (7 - __bit2)) - 1;
+  unsigned char __value = (__bit1 <= __bit2 ?
+                           __mask1 & ~__mask2 :
+                           __mask1 | ~__mask2);
+  return (vector unsigned char)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_genmasks_16(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 15;
+  unsigned char __bit2 = __last & 15;
+  unsigned short __mask1 = (unsigned short)(1U << (15 - __bit1) << 1) - 1;
+  unsigned short __mask2 = (unsigned short)(1U << (15 - __bit2)) - 1;
+  unsigned short __value = (__bit1 <= __bit2 ?
+                            __mask1 & ~__mask2 :
+                            __mask1 | ~__mask2);
+  return (vector unsigned short)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_genmasks_32(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 31;
+  unsigned char __bit2 = __last & 31;
+  unsigned int __mask1 = (1U << (31 - __bit1) << 1) - 1;
+  unsigned int __mask2 = (1U << (31 - __bit2)) - 1;
+  unsigned int __value = (__bit1 <= __bit2 ?
+                          __mask1 & ~__mask2 :
+                          __mask1 | ~__mask2);
+  return (vector unsigned int)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_genmasks_64(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 63;
+  unsigned char __bit2 = __last & 63;
+  unsigned long long __mask1 = (1ULL << (63 - __bit1) << 1) - 1;
+  unsigned long long __mask2 = (1ULL << (63 - __bit2)) - 1;
+  unsigned long long __value = (__bit1 <= __bit2 ?
+                                __mask1 & ~__mask2 :
+                                __mask1 | ~__mask2);
+  return (vector unsigned long long)__value;
+}
+
+/*-- vec_splat --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splat(vector signed char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector signed char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_splat(vector bool char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector bool char)(vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splat(vector unsigned char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splat(vector signed short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector signed short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_splat(vector bool short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector bool short)(vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splat(vector unsigned short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splat(vector signed int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector signed int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_splat(vector bool int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector bool int)(vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splat(vector unsigned int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splat(vector signed long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector signed long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_splat(vector bool long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector bool long long)(vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splat(vector unsigned long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splat(vector double __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector double)__vec[__index];
+}
+
+/*-- vec_splat_s* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector signed char
+vec_splat_s8(signed char __scalar)
+  __constant(__scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_ai vector signed short
+vec_splat_s16(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_ai vector signed int
+vec_splat_s32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector signed long long
+vec_splat_s64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed long long)(signed long)__scalar;
+}
+
+/*-- vec_splat_u* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_splat_u8(unsigned char __scalar)
+  __constant(__scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned short
+vec_splat_u16(unsigned short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned int
+vec_splat_u32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned long long
+vec_splat_u64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned long long)(signed long long)__scalar;
+}
+
+/*-- vec_splats -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splats(signed char __scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splats(unsigned char __scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splats(signed short __scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splats(unsigned short __scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splats(signed int __scalar) {
+  return (vector signed int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splats(unsigned int __scalar) {
+  return (vector unsigned int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splats(signed long long __scalar) {
+  return (vector signed long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splats(unsigned long long __scalar) {
+  return (vector unsigned long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splats(double __scalar) {
+  return (vector double)__scalar;
+}
+
+/*-- vec_extend_s64 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed char __a) {
+  return (vector signed long long)(__a[7], __a[15]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed short __a) {
+  return (vector signed long long)(__a[3], __a[7]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed int __a) {
+  return (vector signed long long)(__a[1], __a[3]);
+}
+
+/*-- vec_mergeh -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergeh(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergeh(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergeh(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergeh(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergeh(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergeh(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergeh(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergeh(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergeh(vector double __a, vector double __b) {
+  return (vector double)(__a[0], __b[0]);
+}
+
+/*-- vec_mergel -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergel(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergel(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergel(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergel(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergel(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergel(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergel(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergel(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergel(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergel(vector double __a, vector double __b) {
+  return (vector double)(__a[1], __b[1]);
+}
+
+/*-- vec_pack ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_pack(vector signed short __a, vector signed short __b) {
+  vector signed char __ac = (vector signed char)__a;
+  vector signed char __bc = (vector signed char)__b;
+  return (vector signed char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_pack(vector bool short __a, vector bool short __b) {
+  vector bool char __ac = (vector bool char)__a;
+  vector bool char __bc = (vector bool char)__b;
+  return (vector bool char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_pack(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return (vector unsigned char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_pack(vector signed int __a, vector signed int __b) {
+  vector signed short __ac = (vector signed short)__a;
+  vector signed short __bc = (vector signed short)__b;
+  return (vector signed short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_pack(vector bool int __a, vector bool int __b) {
+  vector bool short __ac = (vector bool short)__a;
+  vector bool short __bc = (vector bool short)__b;
+  return (vector bool short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_pack(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return (vector unsigned short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_pack(vector signed long long __a, vector signed long long __b) {
+  vector signed int __ac = (vector signed int)__a;
+  vector signed int __bc = (vector signed int)__b;
+  return (vector signed int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_pack(vector bool long long __a, vector bool long long __b) {
+  vector bool int __ac = (vector bool int)__a;
+  vector bool int __bc = (vector bool int)__b;
+  return (vector bool int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return (vector unsigned int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+/*-- vec_packs --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vpksh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vpksf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vpksg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packs_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return __builtin_s390_vpkshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return __builtin_s390_vpksfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs_cc(vector signed long long __a, vector signed long long __b,
+             int *__cc) {
+  return __builtin_s390_vpksgs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs_cc(vector unsigned long long __a, vector unsigned long long __b,
+             int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_packsu -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector signed short __a, vector signed short __b) {
+  const vector signed short __zero = (vector signed short)0;
+  return __builtin_s390_vpklsh(
+    (vector unsigned short)(__a >= __zero) & (vector unsigned short)__a,
+    (vector unsigned short)(__b >= __zero) & (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector signed int __a, vector signed int __b) {
+  const vector signed int __zero = (vector signed int)0;
+  return __builtin_s390_vpklsf(
+    (vector unsigned int)(__a >= __zero) & (vector unsigned int)__a,
+    (vector unsigned int)(__b >= __zero) & (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector signed long long __a, vector signed long long __b) {
+  const vector signed long long __zero = (vector signed long long)0;
+  return __builtin_s390_vpklsg(
+    (vector unsigned long long)(__a >= __zero) &
+    (vector unsigned long long)__a,
+    (vector unsigned long long)(__b >= __zero) &
+    (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packsu_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu_cc(vector unsigned long long __a, vector unsigned long long __b,
+              int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_unpackh ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackh(vector signed char __a) {
+  return __builtin_s390_vuphb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackh(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuphb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackh(vector unsigned char __a) {
+  return __builtin_s390_vuplhb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackh(vector signed short __a) {
+  return __builtin_s390_vuphh(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackh(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuphh((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackh(vector unsigned short __a) {
+  return __builtin_s390_vuplhh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackh(vector signed int __a) {
+  return __builtin_s390_vuphf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackh(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuphf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackh(vector unsigned int __a) {
+  return __builtin_s390_vuplhf(__a);
+}
+
+/*-- vec_unpackl ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackl(vector signed char __a) {
+  return __builtin_s390_vuplb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackl(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuplb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackl(vector unsigned char __a) {
+  return __builtin_s390_vupllb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackl(vector signed short __a) {
+  return __builtin_s390_vuplhw(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackl(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuplhw((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackl(vector unsigned short __a) {
+  return __builtin_s390_vupllh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackl(vector signed int __a) {
+  return __builtin_s390_vuplf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackl(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuplf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackl(vector unsigned int __a) {
+  return __builtin_s390_vupllf(__a);
+}
+
+/*-- vec_cmpeq --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+/*-- vec_cmpge --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+/*-- vec_cmpgt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+/*-- vec_cmple --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector double __a, vector double __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+/*-- vec_cmplt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+/*-- vec_all_eq -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ge -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_le -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_nge ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nan ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_any_eq -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ge -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_le -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_nge ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nan ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 3;
+}
+
+/*-- vec_any_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_andc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_andc(vector bool char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector bool char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector bool char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_andc(vector bool short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector bool short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector bool short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_andc(vector bool int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector bool int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector bool int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_andc(vector bool long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector bool long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector bool long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector bool long long __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+/*-- vec_nor ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_nor(vector bool char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector bool char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector bool char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_nor(vector bool short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector bool short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector bool short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_nor(vector bool int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector bool int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector bool int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_nor(vector bool long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector bool long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector bool long long __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector bool long long __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+/*-- vec_cntlz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector signed char __a) {
+  return __builtin_s390_vclzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector unsigned char __a) {
+  return __builtin_s390_vclzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector signed short __a) {
+  return __builtin_s390_vclzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector unsigned short __a) {
+  return __builtin_s390_vclzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector signed int __a) {
+  return __builtin_s390_vclzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector unsigned int __a) {
+  return __builtin_s390_vclzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector signed long long __a) {
+  return __builtin_s390_vclzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_s390_vclzg(__a);
+}
+
+/*-- vec_cnttz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector signed char __a) {
+  return __builtin_s390_vctzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector unsigned char __a) {
+  return __builtin_s390_vctzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector signed short __a) {
+  return __builtin_s390_vctzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector unsigned short __a) {
+  return __builtin_s390_vctzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector signed int __a) {
+  return __builtin_s390_vctzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector unsigned int __a) {
+  return __builtin_s390_vctzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector signed long long __a) {
+  return __builtin_s390_vctzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector unsigned long long __a) {
+  return __builtin_s390_vctzg(__a);
+}
+
+/*-- vec_popcnt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector signed char __a) {
+  return __builtin_s390_vpopctb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector unsigned char __a) {
+  return __builtin_s390_vpopctb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector signed short __a) {
+  return __builtin_s390_vpopcth((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector unsigned short __a) {
+  return __builtin_s390_vpopcth(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector signed int __a) {
+  return __builtin_s390_vpopctf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector unsigned int __a) {
+  return __builtin_s390_vpopctf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector signed long long __a) {
+  return __builtin_s390_vpopctg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector unsigned long long __a) {
+  return __builtin_s390_vpopctg(__a);
+}
+
+/*-- vec_rl -----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_verllvb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_verllvb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_verllvh(
+    (vector unsigned short)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rl(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_verllvh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_verllvf(
+    (vector unsigned int)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rl(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_verllvf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rl(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_verllvg(
+    (vector unsigned long long)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_verllvg(__a, __b);
+}
+
+/*-- vec_rli ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rli(vector signed char __a, unsigned long __b) {
+  return (vector signed char)__builtin_s390_verllb(
+    (vector unsigned char)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rli(vector unsigned char __a, unsigned long __b) {
+  return __builtin_s390_verllb(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rli(vector signed short __a, unsigned long __b) {
+  return (vector signed short)__builtin_s390_verllh(
+    (vector unsigned short)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rli(vector unsigned short __a, unsigned long __b) {
+  return __builtin_s390_verllh(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rli(vector signed int __a, unsigned long __b) {
+  return (vector signed int)__builtin_s390_verllf(
+    (vector unsigned int)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rli(vector unsigned int __a, unsigned long __b) {
+  return __builtin_s390_verllf(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rli(vector signed long long __a, unsigned long __b) {
+  return (vector signed long long)__builtin_s390_verllg(
+    (vector unsigned long long)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rli(vector unsigned long long __a, unsigned long __b) {
+  return __builtin_s390_verllg(__a, (int)__b);
+}
+
+/*-- vec_rl_mask ------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_rl_mask(vector signed char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned char
+vec_rl_mask(vector unsigned char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed short
+vec_rl_mask(vector signed short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned short
+vec_rl_mask(vector unsigned short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed int
+vec_rl_mask(vector signed int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned int
+vec_rl_mask(vector unsigned int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed long long
+vec_rl_mask(vector signed long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned long long
+vec_rl_mask(vector unsigned long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
+#define vec_rl_mask(X, Y, Z) ((__typeof__((vec_rl_mask)((X), (Y), (Z)))) \
+  __extension__ ({ \
+    vector unsigned char __res; \
+    vector unsigned char __x = (vector unsigned char)(X); \
+    vector unsigned char __y = (vector unsigned char)(Y); \
+    switch (sizeof ((X)[0])) { \
+    case 1: __res = (vector unsigned char) __builtin_s390_verimb( \
+             (vector unsigned char)__x, (vector unsigned char)__x, \
+             (vector unsigned char)__y, (Z)); break; \
+    case 2: __res = (vector unsigned char) __builtin_s390_verimh( \
+             (vector unsigned short)__x, (vector unsigned short)__x, \
+             (vector unsigned short)__y, (Z)); break; \
+    case 4: __res = (vector unsigned char) __builtin_s390_verimf( \
+             (vector unsigned int)__x, (vector unsigned int)__x, \
+             (vector unsigned int)__y, (Z)); break; \
+    default: __res = (vector unsigned char) __builtin_s390_verimg( \
+             (vector unsigned long long)__x, (vector unsigned long long)__x, \
+             (vector unsigned long long)__y, (Z)); break; \
+    } __res; }))
+
+/*-- vec_sll ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_slb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vslb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vslb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_sld ----------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sld(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned char
+vec_sld(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed short
+vec_sld(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned short
+vec_sld(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed int
+vec_sld(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned int
+vec_sld(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed long long
+vec_sld(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned long long
+vec_sld(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector double
+vec_sld(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 15);
+
+#define vec_sld(X, Y, Z) ((__typeof__((vec_sld)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z)))
+
+/*-- vec_sldw ---------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sldw(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned char
+vec_sldw(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed short
+vec_sldw(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned short
+vec_sldw(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed int
+vec_sldw(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned int
+vec_sldw(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed long long
+vec_sldw(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_sldw(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
+#define vec_sldw(X, Y, Z) ((__typeof__((vec_sldw)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z) * 4))
+
+/*-- vec_sral ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsra(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srab ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrab(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrab(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srl ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrlb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrlb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_abs ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_abs(vector signed char __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed char)0));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_abs(vector signed short __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed short)0));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_abs(vector signed int __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed int)0));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_abs(vector signed long long __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed long long)0));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_abs(vector double __a) {
+  return __builtin_s390_vflpdb(__a);
+}
+
+/*-- vec_nabs ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_nabs(vector double __a) {
+  return __builtin_s390_vflndb(__a);
+}
+
+/*-- vec_max ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector signed char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector signed short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector signed int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_max(vector double __a, vector double __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+/*-- vec_min ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector signed char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector signed short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector signed int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_min(vector double __a, vector double __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+/*-- vec_add_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_add_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaq(__a, __b);
+}
+
+/*-- vec_addc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_addc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_addc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vacch(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_addc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vaccf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_addc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vaccg(__a, __b);
+}
+
+/*-- vec_addc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccq(__a, __b);
+}
+
+/*-- vec_adde_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_adde_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vacq(__a, __b, __c);
+}
+
+/*-- vec_addec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vacccq(__a, __b, __c);
+}
+
+/*-- vec_avg ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_avg(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vavgb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_avg(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vavgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_avg(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vavgf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_avg(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vavgg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_avg(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vavglb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_avg(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vavglh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_avg(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vavglf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_avg(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vavglg(__a, __b);
+}
+
+/*-- vec_checksum -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned int
+vec_checksum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vcksm(__a, __b);
+}
+
+/*-- vec_gfmsum -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vgfmb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vgfmh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vgfmf(__a, __b);
+}
+
+/*-- vec_gfmsum_128 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vgfmg(__a, __b);
+}
+
+/*-- vec_gfmsum_accum -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum_accum(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned short __c) {
+  return __builtin_s390_vgfmab(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum_accum(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned int __c) {
+  return __builtin_s390_vgfmah(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum_accum(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned long long __c) {
+  return __builtin_s390_vgfmaf(__a, __b, __c);
+}
+
+/*-- vec_gfmsum_accum_128 ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_accum_128(vector unsigned long long __a,
+                     vector unsigned long long __b,
+                     vector unsigned char __c) {
+  return __builtin_s390_vgfmag(__a, __b, __c);
+}
+
+/*-- vec_mladd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector unsigned char __a, vector signed char __b,
+          vector signed char __c) {
+  return (vector signed char)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * (vector signed char)__b + (vector signed char)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mladd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector unsigned short __a, vector signed short __b,
+          vector signed short __c) {
+  return (vector signed short)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * (vector signed short)__b + (vector signed short)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mladd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector unsigned int __a, vector signed int __b,
+          vector signed int __c) {
+  return (vector signed int)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * (vector signed int)__b + (vector signed int)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mladd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * __b + __c;
+}
+
+/*-- vec_mhadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mhadd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __builtin_s390_vmahb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mhadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __builtin_s390_vmalhb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mhadd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __builtin_s390_vmahh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mhadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalhh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mhadd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __builtin_s390_vmahf(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mhadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmalhf(__a, __b, __c);
+}
+
+/*-- vec_meadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_meadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaeb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_meadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmaleb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_meadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaeh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_meadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaleh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_meadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaef(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_meadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalef(__a, __b, __c);
+}
+
+/*-- vec_moadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_moadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_moadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_moadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaoh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_moadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaloh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_moadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaof(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_moadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalof(__a, __b, __c);
+}
+
+/*-- vec_mulh ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mulh(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mulh(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulh(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmlhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulh(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmhf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulh(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlhf(__a, __b);
+}
+
+/*-- vec_mule ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mule(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mule(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmleb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mule(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mule(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmleh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mule(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmef(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mule(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlef(__a, __b);
+}
+
+/*-- vec_mulo ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulo(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulo(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulo(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmoh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulo(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmloh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mulo(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmof(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mulo(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlof(__a, __b);
+}
+
+/*-- vec_sub_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sub_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsq(__a, __b);
+}
+
+/*-- vec_subc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_subc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbib(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_subc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vscbih(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_subc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vscbif(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_subc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vscbig(__a, __b);
+}
+
+/*-- vec_subc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbiq(__a, __b);
+}
+
+/*-- vec_sube_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sube_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vsbiq(__a, __b, __c);
+}
+
+/*-- vec_subec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vsbcbiq(__a, __b, __c);
+}
+
+/*-- vec_sum2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumgf(__a, __b);
+}
+
+/*-- vec_sum_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumqf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vsumqg(__a, __b);
+}
+
+/*-- vec_sum4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsumb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumh(__a, __b);
+}
+
+/*-- vec_test_mask ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm(__a, __b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector double __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+/*-- vec_madd ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_madd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmadb(__a, __b, __c);
+}
+
+/*-- vec_msub ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_msub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmsdb(__a, __b, __c);
+}
+
+/*-- vec_sqrt ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_sqrt(vector double __a) {
+  return __builtin_s390_vfsqdb(__a);
+}
+
+/*-- vec_ld2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ld2f(const float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  return __builtin_convertvector(*(const __v2f32 *)__ptr, vector double);
+}
+
+/*-- vec_st2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai void
+vec_st2f(vector double __a, float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  *(__v2f32 *)__ptr = __builtin_convertvector(__a, __v2f32);
+}
+
+/*-- vec_ctd ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector signed long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector unsigned long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+/*-- vec_ctsl ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_ctsl(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+/*-- vec_ctul ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_ctul(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+
+/*-- vec_roundp -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundp(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_ceil ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ceil(vector double __a) {
+  // On this platform, vec_ceil never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_roundm -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundm(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_floor --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_floor(vector double __a) {
+  // On this platform, vec_floor never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_roundz -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundz(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_trunc --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_trunc(vector double __a) {
+  // On this platform, vec_trunc never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_roundc -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundc(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 0);
+}
+
+/*-- vec_round --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_round(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 4);
+}
+
+/*-- vec_fp_test_data_class -------------------------------------------------*/
+
+#define vec_fp_test_data_class(X, Y, Z) \
+  ((vector bool long long)__builtin_s390_vftcidb((X), (Y), (Z)))
+
+/*-- vec_cp_until_zero ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero(vector signed char __a) {
+  return (vector signed char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero(vector bool char __a) {
+  return (vector bool char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero(vector unsigned char __a) {
+  return __builtin_s390_vistrb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero(vector signed short __a) {
+  return (vector signed short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero(vector bool short __a) {
+  return (vector bool short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero(vector unsigned short __a) {
+  return __builtin_s390_vistrh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero(vector signed int __a) {
+  return (vector signed int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero(vector bool int __a) {
+  return (vector bool int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero(vector unsigned int __a) {
+  return __builtin_s390_vistrf(__a);
+}
+
+/*-- vec_cp_until_zero_cc ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero_cc(vector signed char __a, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero_cc(vector bool char __a, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero_cc(vector unsigned char __a, int *__cc) {
+  return __builtin_s390_vistrbs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero_cc(vector signed short __a, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero_cc(vector bool short __a, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero_cc(vector unsigned short __a, int *__cc) {
+  return __builtin_s390_vistrhs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero_cc(vector signed int __a, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vistrfs((vector unsigned int)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero_cc(vector bool int __a, int *__cc) {
+  return (vector bool int)__builtin_s390_vistrfs((vector unsigned int)__a,
+                                                 __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero_cc(vector unsigned int __a, int *__cc) {
+  return __builtin_s390_vistrfs(__a, __cc);
+}
+
+/*-- vec_cmpeq_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeeb((vector unsigned char)__a,
+                         (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeeb((vector unsigned char)__a,
+                              (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeeh((vector unsigned short)__a,
+                         (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeeh((vector unsigned short)__a,
+                              (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeef((vector unsigned int)__a,
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeef((vector unsigned int)__a,
+                              (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeef(__a, __b);
+}
+
+/*-- vec_cmpeq_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfeebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfeehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfeefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpeq_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeezb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeezb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeezh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeezh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeezf((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeezf((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeezf(__a, __b);
+}
+
+/*-- vec_cmpeq_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezfs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeneb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeneb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeneb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeneh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeneh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeneh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenef((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenef((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenef(__a, __b);
+}
+
+/*-- vec_cmpne_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenebs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenebs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfenebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenehs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenehs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfenehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenefs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenefs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfenefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfenezb((vector unsigned char)__a,
+                           (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfenezb((vector unsigned char)__a,
+                                (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfenezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfenezh((vector unsigned short)__a,
+                           (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfenezh((vector unsigned short)__a,
+                                (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfenezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenezf((vector unsigned int)__a,
+                           (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenezf((vector unsigned int)__a,
+                                (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenezf(__a, __b);
+}
+
+/*-- vec_cmpne_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenezbs((vector unsigned char)__a,
+                            (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenezbs((vector unsigned char)__a,
+                                 (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenezhs((vector unsigned short)__a,
+                            (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenezhs((vector unsigned short)__a,
+                                 (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenezfs((vector unsigned int)__a,
+                            (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenezfs((vector unsigned int)__a,
+                                 (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezfs(__a, __b, __cc);
+}
+
+/*-- vec_cmprg --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 4);
+}
+
+/*-- vec_cmprg_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg_cc(vector unsigned char __a, vector unsigned char __b,
+             vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg_cc(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg_cc(vector unsigned int __a, vector unsigned int __b,
+             vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 4, __cc);
+}
+
+/*-- vec_cmprg_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmprg_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                   vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                   vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                   vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmpnrg -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg(vector unsigned char __a, vector unsigned char __b,
+           vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg(vector unsigned short __a, vector unsigned short __b,
+           vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg(vector unsigned int __a, vector unsigned int __b,
+           vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 12);
+}
+
+/*-- vec_cmpnrg_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg_cc(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg_cc(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg_cc(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 12, __cc);
+}
+
+/*-- vec_cmpnrg_idx ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx(vector unsigned short __a, vector unsigned short __b,
+               vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx(vector unsigned int __a, vector unsigned int __b,
+               vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_idx_cc ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                  vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                  vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                  vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 8, __cc);
+}
+
+/*-- vec_cmpnrg_or_0_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                    vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                    vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                    vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_or_0_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 8, __cc);
+}
+
+/*-- vec_find_any_eq --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 4);
+}
+
+/*-- vec_find_any_eq_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 4, __cc);
+}
+
+/*-- vec_find_any_eq_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_eq_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_ne --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 12);
+}
+
+/*-- vec_find_any_ne_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 12, __cc);
+}
+
+/*-- vec_find_any_ne_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 8, __cc);
+}
+
+/*-- vec_find_any_ne_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 8, __cc);
+}
+
+#undef __constant_pow2_range
+#undef __constant_range
+#undef __constant
+#undef __ATTRS_o
+#undef __ATTRS_o_ai
+#undef __ATTRS_ai
+
+#else
+
+#error "Use -fzvector to enable vector extensions"
+
+#endif
diff --git a/renderscript/clang-include/wmmintrin.h b/renderscript/clang-include/wmmintrin.h
index 369e3c2..a2d9310 100644
--- a/renderscript/clang-include/wmmintrin.h
+++ b/renderscript/clang-include/wmmintrin.h
@@ -26,17 +26,8 @@
 
 #include <emmintrin.h>
 
-#if !defined (__AES__) && !defined (__PCLMUL__)
-# error "AES/PCLMUL instructions not enabled"
-#else
-
-#ifdef __AES__
 #include <__wmmintrin_aes.h>
-#endif /* __AES__ */
 
-#ifdef __PCLMUL__
 #include <__wmmintrin_pclmul.h>
-#endif /* __PCLMUL__ */
 
-#endif /* __AES__ || __PCLMUL__ */
 #endif /* _WMMINTRIN_H */
diff --git a/renderscript/clang-include/x86intrin.h b/renderscript/clang-include/x86intrin.h
index 21a43da..4d8077e 100644
--- a/renderscript/clang-include/x86intrin.h
+++ b/renderscript/clang-include/x86intrin.h
@@ -28,53 +28,29 @@
 
 #include <immintrin.h>
 
-#ifdef __3dNOW__
 #include <mm3dnow.h>
-#endif
 
-#ifdef __BMI__
 #include <bmiintrin.h>
-#endif
 
-#ifdef __BMI2__
 #include <bmi2intrin.h>
-#endif
 
-#ifdef __LZCNT__
 #include <lzcntintrin.h>
-#endif
 
-#ifdef __POPCNT__
 #include <popcntintrin.h>
-#endif
 
-#ifdef __RDSEED__
 #include <rdseedintrin.h>
-#endif
 
-#ifdef __PRFCHW__
 #include <prfchwintrin.h>
-#endif
 
-#ifdef __SSE4A__
 #include <ammintrin.h>
-#endif
 
-#ifdef __FMA4__
 #include <fma4intrin.h>
-#endif
 
-#ifdef __XOP__
 #include <xopintrin.h>
-#endif
 
-#ifdef __TBM__
 #include <tbmintrin.h>
-#endif
 
-#ifdef __F16C__
 #include <f16cintrin.h>
-#endif
 
 /* FIXME: LWP */
 
diff --git a/renderscript/clang-include/xmmintrin.h b/renderscript/clang-include/xmmintrin.h
index 3a6b95e..ae0b2cd 100644
--- a/renderscript/clang-include/xmmintrin.h
+++ b/renderscript/clang-include/xmmintrin.h
@@ -20,13 +20,9 @@
  *
  *===-----------------------------------------------------------------------===
  */
- 
+
 #ifndef __XMMINTRIN_H
 #define __XMMINTRIN_H
- 
-#ifndef __SSE__
-#error "SSE instruction set not enabled"
-#else
 
 #include <mmintrin.h>
 
@@ -40,182 +36,185 @@
 #include <mm_malloc.h>
 #endif
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_add_ss(__m128 __a, __m128 __b)
 {
   __a[0] += __b[0];
   return __a;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_add_ps(__m128 __a, __m128 __b)
 {
   return __a + __b;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sub_ss(__m128 __a, __m128 __b)
 {
   __a[0] -= __b[0];
   return __a;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sub_ps(__m128 __a, __m128 __b)
 {
   return __a - __b;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mul_ss(__m128 __a, __m128 __b)
 {
   __a[0] *= __b[0];
   return __a;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mul_ps(__m128 __a, __m128 __b)
 {
   return __a * __b;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_div_ss(__m128 __a, __m128 __b)
 {
   __a[0] /= __b[0];
   return __a;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_div_ps(__m128 __a, __m128 __b)
 {
   return __a / __b;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sqrt_ss(__m128 __a)
 {
   __m128 __c = __builtin_ia32_sqrtss(__a);
   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sqrt_ps(__m128 __a)
 {
   return __builtin_ia32_sqrtps(__a);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp_ss(__m128 __a)
 {
   __m128 __c = __builtin_ia32_rcpss(__a);
   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp_ps(__m128 __a)
 {
   return __builtin_ia32_rcpps(__a);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rsqrt_ss(__m128 __a)
 {
   __m128 __c = __builtin_ia32_rsqrtss(__a);
   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rsqrt_ps(__m128 __a)
 {
   return __builtin_ia32_rsqrtps(__a);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_min_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_minss(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_min_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_minps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_max_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_maxss(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_max_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_maxps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_and_ps(__m128 __a, __m128 __b)
 {
   return (__m128)((__v4si)__a & (__v4si)__b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_andnot_ps(__m128 __a, __m128 __b)
 {
   return (__m128)(~(__v4si)__a & (__v4si)__b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_or_ps(__m128 __a, __m128 __b)
 {
   return (__m128)((__v4si)__a | (__v4si)__b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_xor_ps(__m128 __a, __m128 __b)
 {
   return (__m128)((__v4si)__a ^ (__v4si)__b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpeq_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpeqss(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpeq_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpeqps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmplt_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpltss(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmplt_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpltps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmple_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpless(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmple_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpleps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpgt_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_shufflevector(__a,
@@ -223,13 +222,13 @@
                                          4, 1, 2, 3);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpgt_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpltps(__b, __a);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpge_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_shufflevector(__a,
@@ -237,49 +236,49 @@
                                          4, 1, 2, 3);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpge_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpleps(__b, __a);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpneq_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpneqss(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpneq_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpneqps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnltss(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnltps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnle_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnless(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnle_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnleps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpngt_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_shufflevector(__a,
@@ -287,13 +286,13 @@
                                          4, 1, 2, 3);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpngt_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnltps(__b, __a);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnge_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_shufflevector(__a,
@@ -301,115 +300,115 @@
                                          4, 1, 2, 3);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnge_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnleps(__b, __a);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpord_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpordss(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpord_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpordps(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpunord_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpunordss(__a, __b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpunord_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpunordps(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comieq(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comilt(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comile(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comigt(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comige(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comineq(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomieq(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomilt(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomile(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomigt(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomige(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomineq(__a, __b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtss_si32(__m128 __a)
 {
   return __builtin_ia32_cvtss2si(__a);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvt_ss2si(__m128 __a)
 {
   return _mm_cvtss_si32(__a);
@@ -417,7 +416,7 @@
 
 #ifdef __x86_64__
 
-static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtss_si64(__m128 __a)
 {
   return __builtin_ia32_cvtss2si64(__a);
@@ -425,56 +424,56 @@
 
 #endif
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtps_pi32(__m128 __a)
 {
   return (__m64)__builtin_ia32_cvtps2pi(__a);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvt_ps2pi(__m128 __a)
 {
   return _mm_cvtps_pi32(__a);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvttss_si32(__m128 __a)
 {
   return __a[0];
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtt_ss2si(__m128 __a)
 {
   return _mm_cvttss_si32(__a);
 }
 
-static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvttss_si64(__m128 __a)
 {
   return __a[0];
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvttps_pi32(__m128 __a)
 {
   return (__m64)__builtin_ia32_cvttps2pi(__a);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtt_ps2pi(__m128 __a)
 {
   return _mm_cvttps_pi32(__a);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsi32_ss(__m128 __a, int __b)
 {
   __a[0] = __b;
   return __a;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvt_si2ss(__m128 __a, int __b)
 {
   return _mm_cvtsi32_ss(__a, __b);
@@ -482,7 +481,7 @@
 
 #ifdef __x86_64__
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsi64_ss(__m128 __a, long long __b)
 {
   __a[0] = __b;
@@ -491,25 +490,25 @@
 
 #endif
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 {
   return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 {
   return _mm_cvtpi32_ps(__a, __b);
 }
 
-static __inline__ float __attribute__((__always_inline__, __nodebug__))
+static __inline__ float __DEFAULT_FN_ATTRS
 _mm_cvtss_f32(__m128 __a)
 {
   return __a[0];
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadh_pi(__m128 __a, const __m64 *__p)
 {
   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
@@ -521,7 +520,7 @@
   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadl_pi(__m128 __a, const __m64 *__p)
 {
   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
@@ -533,7 +532,7 @@
   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load_ss(const float *__p)
 {
   struct __mm_load_ss_struct {
@@ -543,7 +542,7 @@
   return (__m128){ __u, 0, 0, 0 };
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load1_ps(const float *__p)
 {
   struct __mm_load1_ps_struct {
@@ -555,13 +554,13 @@
 
 #define        _mm_load_ps1(p) _mm_load1_ps(p)
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load_ps(const float *__p)
 {
   return *(__m128*)__p;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadu_ps(const float *__p)
 {
   struct __loadu_ps {
@@ -570,63 +569,69 @@
   return ((struct __loadu_ps*)__p)->__v;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadr_ps(const float *__p)
 {
   __m128 __a = _mm_load_ps(__p);
   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_undefined_ps()
+{
+  return (__m128)__builtin_ia32_undef128();
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ss(float __w)
 {
   return (__m128){ __w, 0, 0, 0 };
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set1_ps(float __w)
 {
   return (__m128){ __w, __w, __w, __w };
 }
 
 /* Microsoft specific. */
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ps1(float __w)
 {
     return _mm_set1_ps(__w);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ps(float __z, float __y, float __x, float __w)
 {
   return (__m128){ __w, __x, __y, __z };
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_setr_ps(float __z, float __y, float __x, float __w)
 {
   return (__m128){ __z, __y, __x, __w };
 }
 
-static __inline__ __m128 __attribute__((__always_inline__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_setzero_ps(void)
 {
   return (__m128){ 0, 0, 0, 0 };
 }
 
-static __inline__ void __attribute__((__always_inline__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeh_pi(__m64 *__p, __m128 __a)
 {
   __builtin_ia32_storehps((__v2si *)__p, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_pi(__m64 *__p, __m128 __a)
 {
   __builtin_ia32_storelps((__v2si *)__p, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ss(float *__p, __m128 __a)
 {
   struct __mm_store_ss_struct {
@@ -635,32 +640,32 @@
   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_ps(float *__p, __m128 __a)
 {
   __builtin_ia32_storeups(__p, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store1_ps(float *__p, __m128 __a)
 {
   __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
   _mm_storeu_ps(__p, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ps1(float *__p, __m128 __a)
 {
     return _mm_store1_ps(__p, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ps(float *__p, __m128 __a)
 {
   *(__m128 *)__p = __a;
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storer_ps(float *__p, __m128 __a)
 {
   __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
@@ -679,32 +684,32 @@
 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
 #endif
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pi(__m64 *__p, __m64 __a)
 {
   __builtin_ia32_movntq(__p, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_ps(float *__p, __m128 __a)
 {
   __builtin_ia32_movntps(__p, __a);
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_sfence(void)
 {
   __builtin_ia32_sfence();
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_extract_pi16(__m64 __a, int __n)
 {
   __v4hi __b = (__v4hi)__a;
   return (unsigned short)__b[__n & 3];
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_insert_pi16(__m64 __a, int __d, int __n)
 {
    __v4hi __b = (__v4hi)__a;
@@ -712,121 +717,118 @@
    return (__m64)__b;
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_max_pi16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_max_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_min_pi16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_min_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_pi8(__m64 __a)
 {
   return __builtin_ia32_pmovmskb((__v8qi)__a);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhi_pu16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
 }
 
 #define _mm_shuffle_pi16(a, n) __extension__ ({ \
-  __m64 __a = (a); \
-  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })
+  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 {
   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_avg_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_avg_pu16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sad_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_getcsr(void)
 {
   return __builtin_ia32_stmxcsr();
 }
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_setcsr(unsigned int __i)
 {
   __builtin_ia32_ldmxcsr(__i);
 }
 
 #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
-  __m128 __a = (a); \
-  __m128 __b = (b); \
-  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                   (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                   (((mask) & 0x30) >> 4) + 4, \
                                   (((mask) & 0xc0) >> 6) + 4); })
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_unpackhi_ps(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_unpacklo_ps(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_move_ss(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_movehl_ps(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_movelh_ps(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi16_ps(__m64 __a)
 {
   __m64 __b, __c;
@@ -844,7 +846,7 @@
   return __r;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpu16_ps(__m64 __a)
 {
   __m64 __b, __c;
@@ -861,11 +863,11 @@
   return __r;
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi8_ps(__m64 __a)
 {
   __m64 __b;
-  
+
   __b = _mm_setzero_si64();
   __b = _mm_cmpgt_pi8(__b, __a);
   __b = _mm_unpacklo_pi8(__a, __b);
@@ -873,22 +875,22 @@
   return _mm_cvtpi16_ps(__b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpu8_ps(__m64 __a)
 {
   __m64 __b;
-  
+
   __b = _mm_setzero_si64();
   __b = _mm_unpacklo_pi8(__a, __b);
 
   return _mm_cvtpi16_ps(__b);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 {
   __m128 __c;
-  
+
   __c = _mm_setzero_ps();
   __c = _mm_cvtpi32_ps(__c, __b);
   __c = _mm_movelh_ps(__c, __c);
@@ -896,35 +898,40 @@
   return _mm_cvtpi32_ps(__c, __a);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtps_pi16(__m128 __a)
 {
   __m64 __b, __c;
-  
+
   __b = _mm_cvtps_pi32(__a);
   __a = _mm_movehl_ps(__a, __a);
   __c = _mm_cvtps_pi32(__a);
-  
+
   return _mm_packs_pi32(__b, __c);
 }
 
-static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtps_pi8(__m128 __a)
 {
   __m64 __b, __c;
-  
+
   __b = _mm_cvtps_pi16(__a);
   __c = _mm_setzero_si64();
-  
+
   return _mm_packs_pi16(__b, __c);
 }
 
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_ps(__m128 __a)
 {
   return __builtin_ia32_movmskps(__a);
 }
 
+
+#ifdef _MSC_VER
+#define _MM_ALIGN16 __declspec(align(16))
+#endif
+
 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
 
 #define _MM_EXCEPT_INVALID    (0x0001)
@@ -993,11 +1000,11 @@
 #define _m_ _mm_
 #define _m_ _mm_
 
+#undef __DEFAULT_FN_ATTRS
+
 /* Ugly hack for backwards-compatibility (compatible with gcc) */
 #if defined(__SSE2__) && !__has_feature(modules)
 #include <emmintrin.h>
 #endif
 
-#endif /* __SSE__ */
-
 #endif /* __XMMINTRIN_H */
diff --git a/renderscript/clang-include/xopintrin.h b/renderscript/clang-include/xopintrin.h
index cc94ca0..f07f51c 100644
--- a/renderscript/clang-include/xopintrin.h
+++ b/renderscript/clang-include/xopintrin.h
@@ -28,319 +28,306 @@
 #ifndef __XOPINTRIN_H
 #define __XOPINTRIN_H
 
-#ifndef __XOP__
-# error "XOP instruction set is not enabled"
-#else
-
 #include <fma4intrin.h>
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddw_epi8(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddd_epi8(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddq_epi8(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddd_epi16(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddq_epi16(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddq_epi32(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddw_epu8(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddd_epu8(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddq_epu8(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddd_epu16(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddq_epu16(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_haddq_epu32(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsubw_epi8(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsubd_epi16(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsubq_epi32(__m128i __A)
 {
   return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
 }
 
-static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
 {
   return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_rot_epi8(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_rot_epi16(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_rot_epi32(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_rot_epi64(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
 }
 
 #define _mm_roti_epi8(A, N) __extension__ ({ \
-  __m128i __A = (A); \
-  (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, (N)); })
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
 
 #define _mm_roti_epi16(A, N) __extension__ ({ \
-  __m128i __A = (A); \
-  (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, (N)); })
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
 
 #define _mm_roti_epi32(A, N) __extension__ ({ \
-  __m128i __A = (A); \
-  (__m128i)__builtin_ia32_vprotdi((__v4si)__A, (N)); })
+  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
 
 #define _mm_roti_epi64(A, N) __extension__ ({ \
-  __m128i __A = (A); \
-  (__m128i)__builtin_ia32_vprotqi((__v2di)__A, (N)); })
+  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_shl_epi8(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_shl_epi16(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_shl_epi32(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_shl_epi64(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha_epi8(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha_epi16(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha_epi32(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha_epi64(__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
 }
 
 #define _mm_com_epu8(A, B, N) __extension__ ({ \
-  __m128i __A = (A); \
-  __m128i __B = (B); \
-  (__m128i)__builtin_ia32_vpcomub((__v16qi)__A, (__v16qi)__B, (N)); })
+  (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
+                                  (__v16qi)(__m128i)(B), (N)); })
 
 #define _mm_com_epu16(A, B, N) __extension__ ({ \
-  __m128i __A = (A); \
-  __m128i __B = (B); \
-  (__m128i)__builtin_ia32_vpcomuw((__v8hi)__A, (__v8hi)__B, (N)); })
+  (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
+                                  (__v8hi)(__m128i)(B), (N)); })
 
 #define _mm_com_epu32(A, B, N) __extension__ ({ \
-  __m128i __A = (A); \
-  __m128i __B = (B); \
-  (__m128i)__builtin_ia32_vpcomud((__v4si)__A, (__v4si)__B, (N)); })
+  (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
+                                  (__v4si)(__m128i)(B), (N)); })
 
 #define _mm_com_epu64(A, B, N) __extension__ ({ \
-  __m128i __A = (A); \
-  __m128i __B = (B); \
-  (__m128i)__builtin_ia32_vpcomuq((__v2di)__A, (__v2di)__B, (N)); })
+  (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
+                                  (__v2di)(__m128i)(B), (N)); })
 
 #define _mm_com_epi8(A, B, N) __extension__ ({ \
-  __m128i __A = (A); \
-  __m128i __B = (B); \
-  (__m128i)__builtin_ia32_vpcomb((__v16qi)__A, (__v16qi)__B, (N)); })
+  (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
+                                 (__v16qi)(__m128i)(B), (N)); })
 
 #define _mm_com_epi16(A, B, N) __extension__ ({ \
-  __m128i __A = (A); \
-  __m128i __B = (B); \
-  (__m128i)__builtin_ia32_vpcomw((__v8hi)__A, (__v8hi)__B, (N)); })
+  (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
+                                 (__v8hi)(__m128i)(B), (N)); })
 
 #define _mm_com_epi32(A, B, N) __extension__ ({ \
-  __m128i __A = (A); \
-  __m128i __B = (B); \
-  (__m128i)__builtin_ia32_vpcomd((__v4si)__A, (__v4si)__B, (N)); })
+  (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
+                                 (__v4si)(__m128i)(B), (N)); })
 
 #define _mm_com_epi64(A, B, N) __extension__ ({ \
-  __m128i __A = (A); \
-  __m128i __B = (B); \
-  (__m128i)__builtin_ia32_vpcomq((__v2di)__A, (__v2di)__B, (N)); })
+  (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
+                                 (__v2di)(__m128i)(B), (N)); })
 
 #define _MM_PCOMCTRL_LT    0
 #define _MM_PCOMCTRL_LE    1
@@ -351,454 +338,445 @@
 #define _MM_PCOMCTRL_FALSE 6
 #define _MM_PCOMCTRL_TRUE  7
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comlt_epu8(__m128i __A, __m128i __B)
 {
   return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comle_epu8(__m128i __A, __m128i __B)
 {
   return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comgt_epu8(__m128i __A, __m128i __B)
 {
   return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comge_epu8(__m128i __A, __m128i __B)
 {
   return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comeq_epu8(__m128i __A, __m128i __B)
 {
   return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comneq_epu8(__m128i __A, __m128i __B)
 {
   return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comfalse_epu8(__m128i __A, __m128i __B)
 {
   return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comtrue_epu8(__m128i __A, __m128i __B)
 {
   return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comlt_epu16(__m128i __A, __m128i __B)
 {
   return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comle_epu16(__m128i __A, __m128i __B)
 {
   return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comgt_epu16(__m128i __A, __m128i __B)
 {
   return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comge_epu16(__m128i __A, __m128i __B)
 {
   return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comeq_epu16(__m128i __A, __m128i __B)
 {
   return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comneq_epu16(__m128i __A, __m128i __B)
 {
   return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comfalse_epu16(__m128i __A, __m128i __B)
 {
   return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comtrue_epu16(__m128i __A, __m128i __B)
 {
   return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comlt_epu32(__m128i __A, __m128i __B)
 {
   return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comle_epu32(__m128i __A, __m128i __B)
 {
   return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comgt_epu32(__m128i __A, __m128i __B)
 {
   return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comge_epu32(__m128i __A, __m128i __B)
 {
   return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comeq_epu32(__m128i __A, __m128i __B)
 {
   return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comneq_epu32(__m128i __A, __m128i __B)
 {
   return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comfalse_epu32(__m128i __A, __m128i __B)
 {
   return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comtrue_epu32(__m128i __A, __m128i __B)
 {
   return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comlt_epu64(__m128i __A, __m128i __B)
 {
   return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comle_epu64(__m128i __A, __m128i __B)
 {
   return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comgt_epu64(__m128i __A, __m128i __B)
 {
   return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comge_epu64(__m128i __A, __m128i __B)
 {
   return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comeq_epu64(__m128i __A, __m128i __B)
 {
   return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comneq_epu64(__m128i __A, __m128i __B)
 {
   return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comfalse_epu64(__m128i __A, __m128i __B)
 {
   return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comtrue_epu64(__m128i __A, __m128i __B)
 {
   return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comlt_epi8(__m128i __A, __m128i __B)
 {
   return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comle_epi8(__m128i __A, __m128i __B)
 {
   return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comgt_epi8(__m128i __A, __m128i __B)
 {
   return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comge_epi8(__m128i __A, __m128i __B)
 {
   return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comeq_epi8(__m128i __A, __m128i __B)
 {
   return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comneq_epi8(__m128i __A, __m128i __B)
 {
   return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comfalse_epi8(__m128i __A, __m128i __B)
 {
   return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comtrue_epi8(__m128i __A, __m128i __B)
 {
   return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comlt_epi16(__m128i __A, __m128i __B)
 {
   return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comle_epi16(__m128i __A, __m128i __B)
 {
   return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comgt_epi16(__m128i __A, __m128i __B)
 {
   return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comge_epi16(__m128i __A, __m128i __B)
 {
   return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comeq_epi16(__m128i __A, __m128i __B)
 {
   return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comneq_epi16(__m128i __A, __m128i __B)
 {
   return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comfalse_epi16(__m128i __A, __m128i __B)
 {
   return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comtrue_epi16(__m128i __A, __m128i __B)
 {
   return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comlt_epi32(__m128i __A, __m128i __B)
 {
   return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comle_epi32(__m128i __A, __m128i __B)
 {
   return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comgt_epi32(__m128i __A, __m128i __B)
 {
   return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comge_epi32(__m128i __A, __m128i __B)
 {
   return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comeq_epi32(__m128i __A, __m128i __B)
 {
   return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comneq_epi32(__m128i __A, __m128i __B)
 {
   return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comfalse_epi32(__m128i __A, __m128i __B)
 {
   return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comtrue_epi32(__m128i __A, __m128i __B)
 {
   return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comlt_epi64(__m128i __A, __m128i __B)
 {
   return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comle_epi64(__m128i __A, __m128i __B)
 {
   return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comgt_epi64(__m128i __A, __m128i __B)
 {
   return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comge_epi64(__m128i __A, __m128i __B)
 {
   return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comeq_epi64(__m128i __A, __m128i __B)
 {
   return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comneq_epi64(__m128i __A, __m128i __B)
 {
   return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comfalse_epi64(__m128i __A, __m128i __B)
 {
   return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
 }
 
-static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_comtrue_epi64(__m128i __A, __m128i __B)
 {
   return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
 }
 
 #define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
-  __m128d __X = (X); \
-  __m128d __Y = (Y); \
-  __m128i __C = (C); \
-  (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, \
-                                     (__v2di)__C, (I)); })
+  (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
+                                     (__v2df)(__m128d)(Y), \
+                                     (__v2di)(__m128i)(C), (I)); })
 
 #define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
-  __m256d __X = (X); \
-  __m256d __Y = (Y); \
-  __m256i __C = (C); \
-  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, \
-                                        (__v4di)__C, (I)); })
+  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
+                                        (__v4df)(__m256d)(Y), \
+                                        (__v4di)(__m256i)(C), (I)); })
 
 #define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
-  __m128 __X = (X); \
-  __m128 __Y = (Y); \
-  __m128i __C = (C); \
-  (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, \
-                                    (__v4si)__C, (I)); })
+  (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
+                                    (__v4si)(__m128i)(C), (I)); })
 
 #define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
-  __m256 __X = (X); \
-  __m256 __Y = (Y); \
-  __m256i __C = (C); \
-  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, \
-                                       (__v8si)__C, (I)); })
+  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
+                                       (__v8sf)(__m256)(Y), \
+                                       (__v8si)(__m256i)(C), (I)); })
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_frcz_ss(__m128 __A)
 {
   return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_frcz_sd(__m128d __A)
 {
   return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
 }
 
-static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_frcz_ps(__m128 __A)
 {
   return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
 }
 
-static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_frcz_pd(__m128d __A)
 {
   return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
 }
 
-static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_frcz_ps(__m256 __A)
 {
   return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
 }
 
-static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_frcz_pd(__m256d __A)
 {
   return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
 }
 
-#endif /* __XOP__ */
+#undef __DEFAULT_FN_ATTRS
 
 #endif /* __XOPINTRIN_H */
diff --git a/renderscript/clang-include/xsavecintrin.h b/renderscript/clang-include/xsavecintrin.h
new file mode 100644
index 0000000..598470a
--- /dev/null
+++ b/renderscript/clang-include/xsavecintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVECINTRIN_H
+#define __XSAVECINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsavec")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/renderscript/clang-include/xsaveintrin.h b/renderscript/clang-include/xsaveintrin.h
new file mode 100644
index 0000000..a2e6b2e
--- /dev/null
+++ b/renderscript/clang-include/xsaveintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEINTRIN_H
+#define __XSAVEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsave")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsave(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xrstor(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsave64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xrstor64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/renderscript/clang-include/xsaveoptintrin.h b/renderscript/clang-include/xsaveoptintrin.h
new file mode 100644
index 0000000..d3faae7
--- /dev/null
+++ b/renderscript/clang-include/xsaveoptintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEOPTINTRIN_H
+#define __XSAVEOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaveopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsaveopt(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsaveopt64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/renderscript/clang-include/xsavesintrin.h b/renderscript/clang-include/xsavesintrin.h
new file mode 100644
index 0000000..c5e540a
--- /dev/null
+++ b/renderscript/clang-include/xsavesintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVESINTRIN_H
+#define __XSAVESINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaves")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/renderscript/clang-include/xtestintrin.h b/renderscript/clang-include/xtestintrin.h
new file mode 100644
index 0000000..9d3378f
--- /dev/null
+++ b/renderscript/clang-include/xtestintrin.h
@@ -0,0 +1,41 @@
+/*===---- xtestintrin.h - XTEST intrinsic ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XTESTINTRIN_H
+#define __XTESTINTRIN_H
+
+/* xtest returns non-zero if the instruction is executed within an RTM or active
+ * HLE region. */
+/* FIXME: This can be an either or for RTM/HLE. Deal with this when HLE is
+ * supported. */
+static __inline__ int
+    __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+    _xtest(void) {
+  return __builtin_ia32_xtest();
+}
+
+#endif