[X86] Add back fmaddsub intrinsics to work towards fixing the strict fp implementation

Previously we emitted an fmadd and a fmadd+fneg and combined them with a shufflevector. But this doesn't follow the correct exception behavior for unselected elements so the backend can't merge them into the fmaddsub/fmsubadd instructions.

This patch restores the the fmaddsub intrinsics so we don't have two arithmetic operations. We lose out on optimization opportunity in the non-strict FP case, but I don't think this is a big loss. If someone gives us a test case we can look into adding instcombine/dagcombine improvements. I'd rather not have the frontend do completely different things for strict and non-strict.

This still has problems because target specific intrinsics don't support strict semantics yet. We also still have all of the problems with masking. But we at least generate the right instruction in constrained mode now.

Differential Revision: https://reviews.llvm.org/D74268
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index bda01c6..db58738 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10152,7 +10152,8 @@
 
   // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
   if (IID != Intrinsic::not_intrinsic &&
-      cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4) {
+      (cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4 ||
+       IsAddSub)) {
     Function *Intr = CGF.CGM.getIntrinsic(IID);
     Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
   } else {
@@ -10165,24 +10166,6 @@
       FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
       Res = CGF.Builder.CreateCall(FMA, {A, B, C});
     }
-
-    if (IsAddSub) {
-      // Negate even elts in C using a mask.
-      unsigned NumElts = Ty->getVectorNumElements();
-      SmallVector<uint32_t, 16> Indices(NumElts);
-      for (unsigned i = 0; i != NumElts; ++i)
-        Indices[i] = i + (i % 2) * NumElts;
-
-      // FIXME: This code isn't exception safe for constrained FP. We need to
-      // suppress exceptions on the unselected elements.
-      Value *NegC = CGF.Builder.CreateFNeg(C);
-      Value *FMSub;
-      if (CGF.Builder.getIsFPConstrained())
-        FMSub = CGF.Builder.CreateConstrainedFPCall(FMA, {A, B, NegC} );
-      else
-        FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} );
-      Res = CGF.Builder.CreateShuffleVector(FMSub, Res, Indices);
-    }
   }
 
   // Handle any required masking.
@@ -10818,10 +10801,6 @@
   case X86::BI__builtin_ia32_vfmaddpd512_mask3:
   case X86::BI__builtin_ia32_vfmsubpd512_mask3:
     return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/false);
-  case X86::BI__builtin_ia32_vfmaddsubps:
-  case X86::BI__builtin_ia32_vfmaddsubpd:
-  case X86::BI__builtin_ia32_vfmaddsubps256:
-  case X86::BI__builtin_ia32_vfmaddsubpd256:
   case X86::BI__builtin_ia32_vfmaddsubps512_mask:
   case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
   case X86::BI__builtin_ia32_vfmaddsubps512_mask3: