[X86] Lowering FMA intrinsics to native IR (Clang part)

This patch replaces all packed (and scalar without rounding
mode) fused intrinsics with fmadd/fmaddsub variations.
Then fmadd/fmaddsub are lowered to native IR.

Patch by tkrupa

Reviewers: craig.topper, sroland, spatel, RKSimon

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D47444

llvm-svn: 333555
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d4b1165..36c811f 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -8427,6 +8427,84 @@
   return Res;
 }
 
+// Lowers X86 FMA intrinsics to IR.
+static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
+                             unsigned BuiltinID) {
+
+  bool IsAddSub = false;
+  bool IsScalar = false;
+
+  // 4 operands always means rounding mode without a mask here.
+  bool IsRound = Ops.size() == 4;
+
+  Intrinsic::ID ID;
+  switch (BuiltinID) {
+  default: break;
+  case clang::X86::BI__builtin_ia32_vfmaddss3: IsScalar = true; break;
+  case clang::X86::BI__builtin_ia32_vfmaddsd3: IsScalar = true; break;
+  case clang::X86::BI__builtin_ia32_vfmaddps512:
+    ID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
+  case clang::X86::BI__builtin_ia32_vfmaddpd512:
+    ID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
+  case clang::X86::BI__builtin_ia32_vfmaddsubps: IsAddSub = true; break;
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd: IsAddSub = true; break;
+  case clang::X86::BI__builtin_ia32_vfmaddsubps256: IsAddSub = true; break;
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd256: IsAddSub = true; break;
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512: {
+    ID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
+    IsAddSub = true;
+    break;
+  }
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512: {
+    ID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
+    IsAddSub = true;
+    break;
+  }
+  }
+
+  // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
+  if (IsRound) {
+    Function *Intr = CGF.CGM.getIntrinsic(ID);
+    if (cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != (uint64_t)4)
+      return CGF.Builder.CreateCall(Intr, Ops);
+  }
+
+  Value *A = Ops[0];
+  Value *B = Ops[1];
+  Value *C = Ops[2];
+
+  if (IsScalar) {
+    A = CGF.Builder.CreateExtractElement(A, (uint64_t)0);
+    B = CGF.Builder.CreateExtractElement(B, (uint64_t)0);
+    C = CGF.Builder.CreateExtractElement(C, (uint64_t)0);
+  }
+
+  llvm::Type *Ty = A->getType();
+  Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
+  Value *Res = CGF.Builder.CreateCall(FMA, {A, B, C} );
+
+  if (IsScalar)
+    return CGF.Builder.CreateInsertElement(Ops[0], Res, (uint64_t)0);
+
+  if (IsAddSub) {
+    // Negate even elts in C using a mask.
+    unsigned NumElts = Ty->getVectorNumElements();
+    SmallVector<Constant *, 16> NMask;
+    Constant *Zero = ConstantInt::get(CGF.Builder.getInt1Ty(), 0);
+    Constant *One = ConstantInt::get(CGF.Builder.getInt1Ty(), 1);
+    for (unsigned i = 0; i < NumElts; ++i) {
+      NMask.push_back(i % 2 == 0 ? One : Zero);
+    }
+    Value *NegMask = ConstantVector::get(NMask);
+
+    Value *NegC = CGF.Builder.CreateFNeg(C);
+    Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} );
+    Res = CGF.Builder.CreateSelect(NegMask, FMSub, Res);
+  }
+
+  return Res;
+}
+
 static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
                            ArrayRef<Value *> Ops) {
   llvm::Type *Ty = Ops[0]->getType();
@@ -8820,6 +8898,22 @@
   case X86::BI__builtin_ia32_cvtq2mask512:
     return EmitX86ConvertToMask(*this, Ops[0]);
 
+  case X86::BI__builtin_ia32_vfmaddss3:
+  case X86::BI__builtin_ia32_vfmaddsd3:
+  case X86::BI__builtin_ia32_vfmaddps:
+  case X86::BI__builtin_ia32_vfmaddpd:
+  case X86::BI__builtin_ia32_vfmaddps256:
+  case X86::BI__builtin_ia32_vfmaddpd256:
+  case X86::BI__builtin_ia32_vfmaddps512:
+  case X86::BI__builtin_ia32_vfmaddpd512:
+  case X86::BI__builtin_ia32_vfmaddsubps:
+  case X86::BI__builtin_ia32_vfmaddsubpd:
+  case X86::BI__builtin_ia32_vfmaddsubps256:
+  case X86::BI__builtin_ia32_vfmaddsubpd256:
+  case X86::BI__builtin_ia32_vfmaddsubps512:
+  case X86::BI__builtin_ia32_vfmaddsubpd512:
+    return EmitX86FMAExpr(*this, Ops, BuiltinID);
+
   case X86::BI__builtin_ia32_movdqa32store128_mask:
   case X86::BI__builtin_ia32_movdqa64store128_mask:
   case X86::BI__builtin_ia32_storeaps128_mask:
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index fc030d8..5bfe39e 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -2577,819 +2577,910 @@
                                           (__mmask8)-1, (int)(R)); })
 
 #define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), \
-                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
-                                           (int)(R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(B), \
+                                      (__v8df)(__m512d)(C), (int)(R)); })
 
 
 #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), \
-                                           (__v8df)(__m512d)(C), \
-                                           (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(B), \
+                                       (__v8df)(__m512d)(C), \
+                                       (int)(R)), \
+            (__v8df)(__m512d)(A)); })
 
 
 #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(B), \
+                                       (__v8df)(__m512d)(C), \
+                                       (int)(R)), \
+            (__v8df)(__m512d)(C)); })
 
 
 #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(B), \
+                                       (__v8df)(__m512d)(C), \
+                                       (int)(R)), \
+            (__v8df)_mm512_setzero_pd()); })
 
 
 #define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), \
-                                           -(__v8df)(__m512d)(C), \
-                                           (__mmask8)-1, (int)(R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(B), \
+                                      -(__v8df)(__m512d)(C), \
+                                      (int)(R)); })
 
 
 #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), \
-                                           -(__v8df)(__m512d)(C), \
-                                           (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(B), \
+                                       -(__v8df)(__m512d)(C), \
+                                       (int)(R)), \
+            (__v8df)(__m512d)(A)); })
 
 
 #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            -(__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(B), \
+                                       -(__v8df)(__m512d)(C), \
+                                       (int)(R)), \
+            (__v8df)_mm512_setzero_pd()); })
 
 
 #define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), \
-                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
-                                           (int)(R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(B), \
+                                      (__v8df)(__m512d)(C), (int)(R)); })
 
 
 #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(B), \
+                                       (__v8df)(__m512d)(C), \
+                                       (int)(R)), \
+            (__v8df)(__m512d)(C)); })
 
 
 #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(B), \
+                                       (__v8df)(__m512d)(C), \
+                                       (int)(R)), \
+            (__v8df)_mm512_setzero_pd()); })
 
 
 #define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), \
-                                           -(__v8df)(__m512d)(C), \
-                                           (__mmask8)-1, (int)(R)); })
+  (__m512d)__builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(B), \
+                                      -(__v8df)(__m512d)(C), \
+                                      (int)(R)); })
 
 
 #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            -(__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(B), \
+                                       -(__v8df)(__m512d)(C), \
+                                       (int)(R)), \
+            (__v8df)_mm512_setzero_pd()); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                               (__v8df) __B,
+                                               (__v8df) __C,
+                                               _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v8df) __A);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v8df) __C);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v8df) _mm512_setzero_pd());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    -(__v8df) __C,
-                                                    (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                               (__v8df) __B,
+                                               -(__v8df) __C,
+                                               _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    -(__v8df) __C,
-                                                    (__mmask8) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                                (__v8df) __B,
+                                                -(__v8df) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v8df) __A);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     -(__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                                (__v8df) __B,
+                                                -(__v8df) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v8df) _mm512_setzero_pd());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
-                                                    (__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
+                                               (__v8df) __B,
+                                               (__v8df) __C,
+                                               _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v8df) __C);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v8df) _mm512_setzero_pd());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
-                                                    (__v8df) __B,
-                                                    -(__v8df) __C,
-                                                    (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
+                                               (__v8df) __B,
+                                               -(__v8df) __C,
+                                               _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
-                                                     (__v8df) __B,
-                                                     -(__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
+                                                (__v8df) __B,
+                                                -(__v8df) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v8df) _mm512_setzero_pd());
 }
 
 #define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
-                                          (int)(R)); })
+  (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(B), \
+                                     (__v16sf)(__m512)(C), (int)(R)); })
 
 
 #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          (__v16sf)(__m512)(C), \
-                                          (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(B), \
+                                      (__v16sf)(__m512)(C), \
+                                      (int)(R)), \
+            (__v16sf)(__m512)(A)); })
 
 
 #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(B), \
+                                      (__v16sf)(__m512)(C), \
+                                      (int)(R)), \
+            (__v16sf)(__m512)(C)); })
 
 
 #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(B), \
+                                      (__v16sf)(__m512)(C), \
+                                      (int)(R)), \
+            (__v16sf)_mm512_setzero_ps()); })
 
 
 #define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          -(__v16sf)(__m512)(C), \
-                                          (__mmask16)-1, (int)(R)); })
+  (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(B), \
+                                     -(__v16sf)(__m512)(C), \
+                                     (int)(R)); })
 
 
 #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          -(__v16sf)(__m512)(C), \
-                                          (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(B), \
+                                      -(__v16sf)(__m512)(C), \
+                                      (int)(R)), \
+            (__v16sf)(__m512)(A)); })
 
 
 #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           -(__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(B), \
+                                      -(__v16sf)(__m512)(C), \
+                                      (int)(R)), \
+            (__v16sf)_mm512_setzero_ps()); })
 
 
 #define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
-                                          (int)(R)); })
+  (__m512)__builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(B), \
+                                     (__v16sf)(__m512)(C), (int)(R)); })
 
 
 #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(B), \
+                                      (__v16sf)(__m512)(C), \
+                                      (int)(R)), \
+            (__v16sf)(__m512)(C)); })
 
 
 #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(B), \
+                                      (__v16sf)(__m512)(C), \
+                                      (int)(R)), \
+            (__v16sf)_mm512_setzero_ps()); })
 
 
 #define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), \
-                                          -(__v16sf)(__m512)(C), \
-                                          (__mmask16)-1, (int)(R)); })
+  (__m512)__builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(B), \
+                                     -(__v16sf)(__m512)(C), \
+                                     (int)(R)); })
 
 
 #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           -(__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(B), \
+                                      -(__v16sf)(__m512)(C), \
+                                      (int)(R)), \
+            (__v16sf)_mm512_setzero_ps()); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   (__mmask16) -1,
-                                                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                              (__v16sf) __B,
+                                              (__v16sf) __C,
+                                              _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   (__mmask16) __U,
-                                                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                                (__v16sf) __B,
+                                                (__v16sf) __C,
+                                                _MM_FROUND_CUR_DIRECTION), \
+            (__v16sf) __A);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                                (__v16sf) __B,
+                                                (__v16sf) __C,
+                                                _MM_FROUND_CUR_DIRECTION), \
+            (__v16sf) __C);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                                (__v16sf) __B,
+                                                (__v16sf) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) _mm512_setzero_ps());
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   -(__v16sf) __C,
-                                                   (__mmask16) -1,
-                                                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                              (__v16sf) __B,
+                                              -(__v16sf) __C,
+                                              _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   -(__v16sf) __C,
-                                                   (__mmask16) __U,
-                                                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                                (__v16sf) __B,
+                                                -(__v16sf) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) __A);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    -(__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                                (__v16sf) __B,
+                                                -(__v16sf) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) _mm512_setzero_ps());
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   (__mmask16) -1,
-                                                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
+                                              (__v16sf) __B,
+                                              (__v16sf) __C,
+                                              _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
+                                                (__v16sf) __B,
+                                                (__v16sf) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) __C);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
+                                                (__v16sf) __B,
+                                                (__v16sf) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) _mm512_setzero_ps());
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   -(__v16sf) __C,
-                                                   (__mmask16) -1,
-                                                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
+                                              (__v16sf) __B,
+                                              -(__v16sf) __C,
+                                              _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    -(__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
+                                                (__v16sf) __B,
+                                                -(__v16sf) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) _mm512_setzero_ps());
 }
 
 #define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
-                                              (__v8df)(__m512d)(B), \
-                                              (__v8df)(__m512d)(C), \
-                                              (__mmask8)-1, (int)(R)); })
+  (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), \
+                                         (__v8df)(__m512d)(C), \
+                                         (int)(R)); })
 
 
 #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
-                                              (__v8df)(__m512d)(B), \
-                                              (__v8df)(__m512d)(C), \
-                                              (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), \
+                                          (__v8df)(__m512d)(C), \
+                                          (int)(R)), \
+            (__v8df)(__m512d)(A)); })
 
 
 #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               (__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), \
+                                          (__v8df)(__m512d)(C), \
+                                          (int)(R)), \
+            (__v8df)(__m512d)(C)); })
 
 
 #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               (__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), \
+                                          (__v8df)(__m512d)(C), \
+                                          (int)(R)), \
+            (__v8df)_mm512_setzero_pd()); })
 
 
 #define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
-                                              (__v8df)(__m512d)(B), \
-                                              -(__v8df)(__m512d)(C), \
-                                              (__mmask8)-1, (int)(R)); })
+  (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), \
+                                         -(__v8df)(__m512d)(C), \
+                                         (int)(R)); })
 
 
 #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
-                                              (__v8df)(__m512d)(B), \
-                                              -(__v8df)(__m512d)(C), \
-                                              (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), \
+                                          -(__v8df)(__m512d)(C), \
+                                          (int)(R)), \
+            (__v8df)(__m512d)(A)); })
 
 
 #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               -(__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), \
+                                          -(__v8df)(__m512d)(C), \
+                                          (int)(R)), \
+            (__v8df)_mm512_setzero_pd()); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       (__v8df) __C,
-                                                       (__mmask8) -1,
-                                                       _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
+                                                  (__v8df) __B,
+                                                  (__v8df) __C,
+                                                  _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       (__v8df) __C,
-                                                       (__mmask8) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   _MM_FROUND_CUR_DIRECTION),
+            (__v8df) __A);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
-                                                        (__v8df) __B,
-                                                        (__v8df) __C,
-                                                        (__mmask8) __U,
-                                                        _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   _MM_FROUND_CUR_DIRECTION),
+            (__v8df) __C);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
-                                                        (__v8df) __B,
-                                                        (__v8df) __C,
-                                                        (__mmask8) __U,
-                                                        _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   _MM_FROUND_CUR_DIRECTION),
+            (__v8df) _mm512_setzero_pd());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       -(__v8df) __C,
-                                                       (__mmask8) -1,
-                                                       _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
+                                                  (__v8df) __B,
+                                                  -(__v8df) __C,
+                                                  _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       -(__v8df) __C,
-                                                       (__mmask8) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   -(__v8df) __C,
+                                                   _MM_FROUND_CUR_DIRECTION),
+            (__v8df) __A);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
-                                                        (__v8df) __B,
-                                                        -(__v8df) __C,
-                                                        (__mmask8) __U,
-                                                        _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   -(__v8df) __C,
+                                                   _MM_FROUND_CUR_DIRECTION),
+            (__v8df) _mm512_setzero_pd());
 }
 
 #define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
-                                             (__v16sf)(__m512)(B), \
-                                             (__v16sf)(__m512)(C), \
-                                             (__mmask16)-1, (int)(R)); })
+  (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(B), \
+                                        (__v16sf)(__m512)(C), \
+                                        (int)(R)); })
 
 
 #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
-                                             (__v16sf)(__m512)(B), \
-                                             (__v16sf)(__m512)(C), \
-                                             (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), \
+                                         (__v16sf)(__m512)(C), \
+                                         (int)(R)), \
+            (__v16sf)(__m512)(A)); })
 
 
 #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              (__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), \
+                                         (__v16sf)(__m512)(C), \
+                                         (int)(R)), \
+            (__v16sf)(__m512)(C)); })
 
 
 #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              (__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), \
+                                         (__v16sf)(__m512)(C), \
+                                         (int)(R)), \
+            (__v16sf)_mm512_setzero_ps()); })
 
 
 #define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
-                                             (__v16sf)(__m512)(B), \
-                                             -(__v16sf)(__m512)(C), \
-                                             (__mmask16)-1, (int)(R)); })
+  (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(B), \
+                                        -(__v16sf)(__m512)(C), \
+                                        (int)(R)); })
 
 
 #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
-                                             (__v16sf)(__m512)(B), \
-                                             -(__v16sf)(__m512)(C), \
-                                             (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), \
+                                         -(__v16sf)(__m512)(C), \
+                                         (int)(R)), \
+            (__v16sf)(__m512)(A)); })
 
 
 #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              -(__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), \
+                                         -(__v16sf)(__m512)(C), \
+                                         (int)(R)), \
+            (__v16sf)_mm512_setzero_ps()); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
-                                                      (__v16sf) __B,
-                                                      (__v16sf) __C,
-                                                      (__mmask16) -1,
-                                                      _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
+                                                 (__v16sf) __B,
+                                                 (__v16sf) __C,
+                                                 _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
-                                                      (__v16sf) __B,
-                                                      (__v16sf) __C,
-                                                      (__mmask16) __U,
-                                                      _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   _MM_FROUND_CUR_DIRECTION), \
+            (__v16sf) __A);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
-                                                       (__v16sf) __B,
-                                                       (__v16sf) __C,
-                                                       (__mmask16) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   _MM_FROUND_CUR_DIRECTION), \
+            (__v16sf) __C);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
-                                                       (__v16sf) __B,
-                                                       (__v16sf) __C,
-                                                       (__mmask16) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) _mm512_setzero_ps());
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
-                                                      (__v16sf) __B,
-                                                      -(__v16sf) __C,
-                                                      (__mmask16) -1,
-                                                      _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
+                                                 (__v16sf) __B,
+                                                 -(__v16sf) __C,
+                                                 _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
-                                                      (__v16sf) __B,
-                                                      -(__v16sf) __C,
-                                                      (__mmask16) __U,
-                                                      _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) __A);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
-                                                       (__v16sf) __B,
-                                                       -(__v16sf) __C,
-                                                       (__mmask16) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) _mm512_setzero_ps());
 }
 
 #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                    (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                                        (__v8df)(__m512d)(B), \
+                                                        -(__v8df)(__m512d)(C), \
+                                                        (int)(R)), \
+                    (__v8df)(__m512d)(C)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    (__m512d)__builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                                         (__v8df) __B,
+                                                         -(__v8df) __C,
+                                                         _MM_FROUND_CUR_DIRECTION),
+                    (__v8df) __C);
 }
 
 #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
-
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                   (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
+                                                      (__v16sf)(__m512)(B), \
+                                                      -(__v16sf)(__m512)(C), \
+                                                      (int)(R)), \
+                   (__v16sf)(__m512)(C)); })
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    (__m512)__builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                                        (__v16sf) __B,
+                                                        -(__v16sf) __C,
+                                                        _MM_FROUND_CUR_DIRECTION),
+                    (__v16sf) __C);
 }
 
 #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               (__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                    (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
+                                                           (__v8df)(__m512d)(B), \
+                                                           -(__v8df)(__m512d)(C), \
+                                                           (int)(R)), \
+                    (__v8df)(__m512d)(C)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
-                                                        (__v8df) __B,
-                                                        (__v8df) __C,
-                                                        (__mmask8) __U,
-                                                        _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    (__m512d)__builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
+                                                            (__v8df) __B,
+                                                            -(__v8df) __C,
+                                                            _MM_FROUND_CUR_DIRECTION),
+                    (__v8df) __C);
 }
 
 #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              (__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                    (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
+                                                          (__v16sf)(__m512)(B), \
+                                                          -(__v16sf)(__m512)(C), \
+                                                          (int)(R)), \
+                    (__v16sf)(__m512)(C)); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
-                                                       (__v16sf) __B,
-                                                       (__v16sf) __C,
-                                                       (__mmask16) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    (__m512)__builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
+                                                           (__v16sf) __B,
+                                                           -(__v16sf) __C,
+                                                           _MM_FROUND_CUR_DIRECTION),
+                    (__v16sf) __C);
 }
 
 #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(B), \
+                                       (__v8df)(__m512d)(C), \
+                                       (int)(R)), \
+            (__v8df)(__m512d)(A)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v8df) __A);
 }
 
 #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+           __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(B), \
+                                      (__v16sf)(__m512)(C), \
+                                      (int)(R)), \
+            (__v16sf)(__m512)(A)); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                    __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
+                                                (__v16sf) __B,
+                                                (__v16sf) __C,
+                                                _MM_FROUND_CUR_DIRECTION),
+            (__v16sf) __A);
 }
 
 #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                                -(__v8df)(__m512d)(B), \
+                                                -(__v8df)(__m512d)(C), \
+                                                (int)(R)), \
+            (__v8df)(__m512d)(A)); })
 
 
 #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
-  (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             (__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+            (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
+                                                -(__v8df)(__m512d)(B), \
+                                                -(__v8df)(__m512d)(C), \
+                                                (int)(R)), \
+            (__v8df)(__m512d)(C)); })
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask16) __U,
+                    (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                                           -(__v8df) __B,
+                                                           -(__v8df) __C,
+                                                           _MM_FROUND_CUR_DIRECTION),
+                    (__v8df) __A);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
-                                                      (__v8df) __B,
-                                                      (__v8df) __C,
-                                                      (__mmask8) __U,
-                                                      _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_selectpd_512((__mmask16) __U,
+                    (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A,
+                                                           -(__v8df) __B,
+                                                           -(__v8df) __C,
+                                                           _MM_FROUND_CUR_DIRECTION),
+                    (__v8df) __C);
 }
 
 #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+            (__m512)__builtin_ia32_vfmaddps512 ((__v16sf)(__m512)(A), \
+                                                -(__v16sf)(__m512)(B), \
+                                                -(__v16sf)(__m512)(C), \
+                                               (int)(R)), \
+            (__v16sf)(__m512)(A)); })
 
 
 #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
-  (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            (__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+            (__m512)__builtin_ia32_vfmaddps512 ((__v16sf)(__m512)(A), \
+                                                -(__v16sf)(__m512)(B), \
+                                                -(__v16sf)(__m512)(C), \
+                                               (int)(R)), \
+            (__v16sf)(__m512)(C)); })
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                   (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                                        -(__v16sf) __B,
+                                                        -(__v16sf) __C,
+                                                        _MM_FROUND_CUR_DIRECTION),
+                   (__v16sf) __A);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
-                                                     (__v16sf) __B,
-                                                     (__v16sf) __C,
-                                                     (__mmask16) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
+                   (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A,
+                                                        -(__v16sf) __B,
+                                                        -(__v16sf) __C,
+                                                        _MM_FROUND_CUR_DIRECTION),
+                   (__v16sf) __C);
 }
 
 
@@ -8112,27 +8203,27 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-          (__v4sf) __A,
-          (__v4sf) __B,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W,
+                                       (__v4sf) __A,
+                                       (__v4sf) __B);
+ __W[0] = (__U & 1) ? __Z[0] : __W[0];
+ return __W;
 }
 
 #define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\
-  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
-                                        (__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
-                                        (int)(R)); })
+  (__m128d)__builtin_ia32_vfmaddss3_mask((__v2df)(__m128d)(W), \
+                                         (__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R)); })
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
- return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __A,
+                                       (__v4sf) __B,
+                                       (__v4sf) __C);
+ __A[0] = (__U & 1) ? __Z[0] : 0;
+ return __A;
 }
 
 #define _mm_maskz_fmadd_round_ss(U, A, B, C, R) __extension__ ({\
@@ -8144,11 +8235,11 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
-          (__v4sf) __X,
-          (__v4sf) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W,
+                                       (__v4sf) __X,
+                                       (__v4sf) __Y);
+ __Y[0] = (__U & 1) ? __Z[0] : __Y[0];
+ return __Y;
 }
 
 #define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) __extension__ ({\
@@ -8160,27 +8251,27 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-          (__v4sf) __A,
-          -(__v4sf) __B,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W,
+                                       (__v4sf) __A,
+                                       -(__v4sf) __B);
+ __W[0] = (__U & 1) ? __Z[0] : __W[0];
+ return __W;
 }
 
 #define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\
   (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)); })
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
- return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
-          (__v4sf) __B,
-          -(__v4sf) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __A,
+                                       (__v4sf) __B,
+                                       -(__v4sf) __C);
+ __A[0] = (__U & 1) ? __Z[0] : 0;
+ return __A;
 }
 
 #define _mm_maskz_fmsub_round_ss(U, A, B, C, R) __extension__ ({\
@@ -8192,11 +8283,11 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
 {
- return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
-          (__v4sf) __X,
-          (__v4sf) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W,
+                                       (__v4sf) __X,
+                                       -(__v4sf) __Y);
+ __Y[0] = (__U & 1) ? __Z[0] : __Y[0];
+ return __Y;
 }
 
 #define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\
@@ -8208,11 +8299,11 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-          -(__v4sf) __A,
-          (__v4sf) __B,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W,
+                                       -(__v4sf) __A,
+                                       (__v4sf) __B);
+ __W[0] = (__U & 1) ? __Z[0] : __W[0];
+ return __W;
 }
 
 #define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\
@@ -8224,43 +8315,43 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
- return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __A,
+                                       -(__v4sf) __B,
+                                       (__v4sf) __C);
+ __A[0] = (__U & 1) ? __Z[0] : 0;
+ return __A;
 }
 
 #define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) __extension__ ({\
-  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
-                                         (__v4sf)(__m128)(B), \
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         -(__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)); })
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
-          (__v4sf) __X,
-          (__v4sf) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W,
+                                       -(__v4sf) __X,
+                                       (__v4sf) __Y);
+ __Y[0] = (__U & 1) ? __Z[0] : __Y[0];
+ return __Y;
 }
 
 #define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) __extension__({\
-  (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
-                                         (__v4sf)(__m128)(X), \
+  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+                                         -(__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)); })
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-          -(__v4sf) __A,
-          -(__v4sf) __B,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W,
+                                       -(__v4sf) __A,
+                                       -(__v4sf) __B);
+ __W[0] = (__U & 1) ? __Z[0] : __W[0];
+ return __W;
 }
 
 #define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\
@@ -8272,43 +8363,43 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
- return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
-          (__v4sf) __B,
-          -(__v4sf) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __A,
+                                       -(__v4sf) __B,
+                                       -(__v4sf) __C);
+ __A[0] = (__U & 1) ? __Z[0] : 0;
+ return __A;
 }
 
 #define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) __extension__ ({\
-  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
-                                         (__v4sf)(__m128)(B), \
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         -(__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                          _MM_FROUND_CUR_DIRECTION); })
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
 {
- return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W,
-          (__v4sf) __X,
-          (__v4sf) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W,
+                                       -(__v4sf) __X,
+                                       -(__v4sf) __Y);
+ __Y[0] = (__U & 1) ? __Z[0] : __Y[0];
+ return __Y;
 }
 
 #define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\
-  (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \
-                                         (__v4sf)(__m128)(X), \
+  (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
+                                         -(__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
-          (__v2df) __A,
-          (__v2df) __B,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W,
+                                        (__v2df) __A,
+                                        (__v2df) __B);
+ __W[0] = (__U & 1) ? __Z[0] : __W[0];
+ return __W;
 }
 
 #define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\
@@ -8320,11 +8411,11 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
-          (__v2df) __B,
-          (__v2df) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __A,
+                                        (__v2df) __B,
+                                        (__v2df) __C);
+ __A[0] = (__U & 1) ? __Z[0] : 0;
+ return __A;
 }
 
 #define _mm_maskz_fmadd_round_sd(U, A, B, C, R) __extension__ ({\
@@ -8336,11 +8427,11 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
-          (__v2df) __X,
-          (__v2df) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W,
+                                        (__v2df) __X,
+                                        (__v2df) __Y);
+ __Y[0] = (__U & 1) ? __Z[0] : __Y[0];
+ return __Y;
 }
 
 #define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) __extension__ ({\
@@ -8352,11 +8443,11 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
-          (__v2df) __A,
-          -(__v2df) __B,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W,
+                                        (__v2df) __A,
+                                        -(__v2df) __B);
+ __W[0] = (__U & 1) ? __Z[0] : __W[0];
+ return __W;
 }
 
 #define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\
@@ -8368,11 +8459,11 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
-          (__v2df) __B,
-          -(__v2df) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __A,
+                                        (__v2df) __B,
+                                        -(__v2df) __C);
+ __A[0] = (__U & 1) ? __Z[0] : 0;
+ return __A;
 }
 
 #define _mm_maskz_fmsub_round_sd(U, A, B, C, R) __extension__ ({\
@@ -8384,11 +8475,11 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 {
- return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
-          (__v2df) __X,
-          (__v2df) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W,
+                                        (__v2df) __X,
+                                        -(__v2df) __Y);
+ __Y[0] = (__U & 1) ? __Z[0] : __Y[0];
+ return __Y;
 }
 
 #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\
@@ -8400,11 +8491,11 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
-          -(__v2df) __A,
-          (__v2df) __B,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W,
+                                        -(__v2df) __A,
+                                        (__v2df) __B);
+ __W[0] = (__U & 1) ? __Z[0] : __W[0];
+ return __W;
 }
 
 #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\
@@ -8416,43 +8507,43 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
-          (__v2df) __B,
-          (__v2df) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __A,
+                                        -(__v2df) __B,
+                                        (__v2df) __C);
+ __A[0] = (__U & 1) ? __Z[0] : 0;
+ return __A;
 }
 
 #define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) __extension__ ({\
-  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
-                                          (__v2df)(__m128d)(B), \
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          -(__v2df)(__m128d)(B), \
                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
                                           (int)(R)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) __W,
-          (__v2df) __X,
-          (__v2df) __Y,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W,
+                                        -(__v2df) __X,
+                                        (__v2df) __Y);
+ __Y[0] = (__U & 1) ? __Z[0] : __Y[0];
+ return __Y;
 }
 
 #define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) __extension__({\
-  (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
-                                          (__v2df)(__m128d)(X), \
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+                                          -(__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
                                           (int)(R)); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
-          -(__v2df) __A,
-          -(__v2df) __B,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W,
+                                        -(__v2df) __A,
+                                        -(__v2df) __B);
+ __W[0] = (__U & 1) ? __Z[0] : __W[0];
+ return __W;
 }
 
 #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\
@@ -8464,16 +8555,16 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
- return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
-          (__v2df) __B,
-          -(__v2df) __C,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __A,
+                                        -(__v2df) __B,
+                                        -(__v2df) __C);
+ __A[0] = (__U & 1) ? __Z[0] : 0;
+ return __A;
 }
 
 #define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) __extension__ ({\
-  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
-                                          (__v2df)(__m128d)(B), \
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          -(__v2df)(__m128d)(B), \
                                           -(__v2df)(__m128d)(C), \
                                           (__mmask8)(U), \
                                           _MM_FROUND_CUR_DIRECTION); })
@@ -8481,16 +8572,16 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
 {
- return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W),
-          (__v2df) __X,
-          (__v2df) (__Y),
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
+ __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W,
+                                        -(__v2df) __X,
+                                        -(__v2df) __Y);
+ __Y[0] = (__U & 1) ? __Z[0] : __Y[0];
+ return __Y;
 }
 
 #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\
-  (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \
-                                          (__v2df)(__m128d)(X), \
+  (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
+                                          -(__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), \
                                           (__mmask8)(U), (int)(R)); })
 
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 833e70a..9a2bf34 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -813,658 +813,722 @@
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
-                                                    (__v2df) __B,
-                                                    (__v2df) __C,
-                                                    (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df) __A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df) __C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
-                                                    (__v2df) __B,
-                                                    -(__v2df) __C,
-                                                    (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df) __A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     -(__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df) __C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
-                                                     (__v2df) __B,
-                                                     -(__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
+                                             (__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
-                                                    (__v4df) __B,
-                                                    (__v4df) __C,
-                                                    (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df) __A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df) __C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
-                                                    (__v4df) __B,
-                                                    -(__v4df) __C,
-                                                    (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df) __A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     -(__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df) __C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
-                                                     (__v4df) __B,
-                                                     -(__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
+                                                (__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
-                                                   (__v4sf) __B,
-                                                   (__v4sf) __C,
-                                                   (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf) __A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf) __C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
-                                                   (__v4sf) __B,
-                                                   -(__v4sf) __C,
-                                                   (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf) __A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    -(__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf) __C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
+                                             (__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    -(__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
+                                             (__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   (__v8sf) __C,
-                                                   (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf) __A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf) __C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   -(__v8sf) __C,
-                                                   (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf) __A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    -(__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf) __C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    -(__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
+                                                (__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
-                                                       (__v2df) __B,
-                                                       (__v2df) __C,
-                                                       (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __C),
+                    (__v2df) __A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A,
-                                                        (__v2df) __B,
-                                                        (__v2df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __C),
+                    (__v2df) __C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
-                                                        (__v2df) __B,
-                                                        (__v2df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
-                                                       (__v2df) __B,
-                                                       -(__v2df) __C,
-                                                       (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                -(__v2df) __C),
+                    (__v2df) __A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
-                                                        (__v2df) __B,
-                                                        -(__v2df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                -(__v2df) __C),
+                    (__v2df)_mm_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
-                                                       (__v4df) __B,
-                                                       (__v4df) __C,
-                                                       (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __C),
+                    (__v4df) __A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A,
-                                                        (__v4df) __B,
-                                                        (__v4df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __C),
+                    (__v4df) __C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
-                                                        (__v4df) __B,
-                                                        (__v4df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
-                                                       (__v4df) __B,
-                                                       -(__v4df) __C,
-                                                       (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   -(__v4df) __C),
+                    (__v4df) __A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
-                                                        (__v4df) __B,
-                                                        -(__v4df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   -(__v4df) __C),
+                    (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
-                                                      (__v4sf) __B,
-                                                      (__v4sf) __C,
-                                                      (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __C),
+                    (__v4sf) __A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A,
-                                                       (__v4sf) __B,
-                                                       (__v4sf) __C,
-                                                       (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __C),
+                    (__v4sf) __C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
-                                                       (__v4sf) __B,
-                                                       (__v4sf) __C,
-                                                       (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
-                                                      (__v4sf) __B,
-                                                      -(__v4sf) __C,
-                                                      (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                -(__v4sf) __C),
+                    (__v4sf) __A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
-                                                       (__v4sf) __B,
-                                                       -(__v4sf) __C,
-                                                       (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                -(__v4sf) __C),
+                    (__v4sf)_mm_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
                          __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
-                                                      (__v8sf) __B,
-                                                      (__v8sf) __C,
-                                                      (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C),
+                    (__v8sf) __A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A,
-                                                       (__v8sf) __B,
-                                                       (__v8sf) __C,
-                                                       (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C),
+                    (__v8sf) __C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
-                                                       (__v8sf) __B,
-                                                       (__v8sf) __C,
-                                                       (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
-                                                      (__v8sf) __B,
-                                                      -(__v8sf) __C,
-                                                      (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C),
+                    (__v8sf) __A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
-                                                       (__v8sf) __B,
-                                                       -(__v8sf) __C,
-                                                       (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C),
+                    (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             (__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df) __C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                (__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df) __C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf) __C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf) __C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A,
-                                                        (__v2df) __B,
-                                                        (__v2df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+                                                (__v2df) __B,
+                                                -(__v2df) __C),
+                    (__v2df) __C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A,
-                                                        (__v4df) __B,
-                                                        (__v4df) __C,
-                                                        (__mmask8)
-                                                        __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   -(__v4df) __C),
+                    (__v4df) __C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A,
-                                                       (__v4sf) __B,
-                                                       (__v4sf) __C,
-                                                       (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                -(__v4sf) __C),
+                    (__v4sf) __C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A,
-                                                       (__v8sf) __B,
-                                                       (__v8sf) __C,
-                                                       (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C),
+                    (__v8sf) __C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             -(__v2df) __B,
+                                             (__v2df) __C),
+                    (__v2df) __A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                -(__v4df) __B,
+                                                (__v4df) __C),
+                    (__v4df) __A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             -(__v4sf) __B,
+                                             (__v4sf) __C),
+                    (__v4sf) __A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                -(__v8sf) __B,
+                                                (__v8sf) __C),
+                    (__v8sf) __A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
 {
-  return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2df) __C,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             -(__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df) __A);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
 {
-  return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A,
-                                                      (__v2df) __B,
-                                                      (__v2df) __C,
-                                                      (__mmask8) __U);
+  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd ((__v2df) __A,
+                                             -(__v2df) __B,
+                                             -(__v2df) __C),
+                    (__v2df) __C);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
 {
-  return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4df) __C,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                -(__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df) __A);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
 {
-  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A,
-                                                      (__v4df) __B,
-                                                      (__v4df) __C,
-                                                      (__mmask8) __U);
+  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+                                                -(__v4df) __B,
+                                                -(__v4df) __C),
+                    (__v4df) __C);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
 {
-  return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4sf) __C,
-                                                    (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             -(__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf) __A);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
 {
-  return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A,
-                                                     (__v4sf) __B,
-                                                     (__v4sf) __C,
-                                                     (__mmask8) __U);
+  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+                    __builtin_ia32_vfmaddps ((__v4sf) __A,
+                                             -(__v4sf) __B,
+                                             -(__v4sf) __C),
+                    (__v4sf) __C);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
 {
-  return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8sf) __C,
-                                                    (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                -(__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf) __A);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
 {
-  return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A,
-                                                     (__v8sf) __B,
-                                                     (__v8sf) __C,
-                                                     (__mmask8) __U);
+  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+                                                -(__v8sf) __B,
+                                                -(__v8sf) __C),
+                    (__v8sf) __C);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 2f15c51..7b9975e 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2340,6 +2340,10 @@
   case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
   case X86::BI__builtin_ia32_sqrtpd512_mask:
   case X86::BI__builtin_ia32_sqrtps512_mask:
+  case X86::BI__builtin_ia32_vfmaddpd512:
+  case X86::BI__builtin_ia32_vfmaddps512:
+  case X86::BI__builtin_ia32_vfmaddsubpd512:
+  case X86::BI__builtin_ia32_vfmaddsubps512:
     ArgNum = 3;
     HasRC = true;
     break;
@@ -2368,28 +2372,6 @@
   case X86::BI__builtin_ia32_cvtsd2ss_round_mask:
   case X86::BI__builtin_ia32_sqrtsd_round_mask:
   case X86::BI__builtin_ia32_sqrtss_round_mask:
-  case X86::BI__builtin_ia32_vfmaddpd512_mask:
-  case X86::BI__builtin_ia32_vfmaddpd512_mask3:
-  case X86::BI__builtin_ia32_vfmaddpd512_maskz:
-  case X86::BI__builtin_ia32_vfmaddps512_mask:
-  case X86::BI__builtin_ia32_vfmaddps512_mask3:
-  case X86::BI__builtin_ia32_vfmaddps512_maskz:
-  case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
-  case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
-  case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
-  case X86::BI__builtin_ia32_vfmaddsubps512_mask:
-  case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
-  case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
-  case X86::BI__builtin_ia32_vfmsubpd512_mask3:
-  case X86::BI__builtin_ia32_vfmsubps512_mask3:
-  case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
-  case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
-  case X86::BI__builtin_ia32_vfnmaddpd512_mask:
-  case X86::BI__builtin_ia32_vfnmaddps512_mask:
-  case X86::BI__builtin_ia32_vfnmsubpd512_mask:
-  case X86::BI__builtin_ia32_vfnmsubpd512_mask3:
-  case X86::BI__builtin_ia32_vfnmsubps512_mask:
-  case X86::BI__builtin_ia32_vfnmsubps512_mask3:
   case X86::BI__builtin_ia32_vfmaddsd3_mask:
   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
   case X86::BI__builtin_ia32_vfmaddsd3_mask3: