[FPEnv][AArch64] Platform-specific builtin constrained FP enablement

When constrained floating point is enabled the AArch64-specific builtins don't use constrained intrinsics in some cases. Fix that.

Neon is part of this patch, so ARM is affected as well.

Differential Revision: https://reviews.llvm.org/D77074
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 880fe0e..f985e58 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -411,6 +411,25 @@
   }
 }
 
+// Emit an intrinsic where all operands are of the same type as the result.
+// Depending on mode, this may be a constrained floating-point intrinsic.
+static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
+                                                unsigned IntrinsicID,
+                                                unsigned ConstrainedIntrinsicID,
+                                                llvm::Type *Ty,
+                                                ArrayRef<Value *> Args) {
+  Function *F;
+  if (CGF.Builder.getIsFPConstrained())
+    F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
+  else
+    F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);
+
+  if (CGF.Builder.getIsFPConstrained())
+    return CGF.Builder.CreateConstrainedFPCall(F, Args);
+  else
+    return CGF.Builder.CreateCall(F, Args);
+}
+
 // Emit a simple mangled intrinsic that has 1 argument and a return type
 // matching the argument type.
 static Value *emitUnaryBuiltin(CodeGenFunction &CGF,
@@ -4511,13 +4530,20 @@
                                      unsigned shift, bool rightshift) {
   unsigned j = 0;
   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-       ai != ae; ++ai, ++j)
+       ai != ae; ++ai, ++j) {
+    if (F->isConstrainedFPIntrinsic())
+      if (ai->getType()->isMetadataTy())
+        continue;
     if (shift > 0 && shift == j)
       Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
     else
       Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
+  }
 
-  return Builder.CreateCall(F, Ops, name);
+  if (F->isConstrainedFPIntrinsic())
+    return Builder.CreateConstrainedFPCall(F, Ops, name);
+  else
+    return Builder.CreateCall(F, Ops, name);
 }
 
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
@@ -5695,13 +5721,14 @@
   }
   case NEON::BI__builtin_neon_vfma_v:
   case NEON::BI__builtin_neon_vfmaq_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
 
     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
-    return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
+    return emitCallMaybeConstrainedFPBuiltin(
+        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
+        {Ops[1], Ops[2], Ops[0]});
   }
   case NEON::BI__builtin_neon_vld1_v:
   case NEON::BI__builtin_neon_vld1q_v: {
@@ -5859,7 +5886,9 @@
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
   case NEON::BI__builtin_neon_vrndi_v:
   case NEON::BI__builtin_neon_vrndiq_v:
-    Int = Intrinsic::nearbyint;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_nearbyint
+              : Intrinsic::nearbyint;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
   case NEON::BI__builtin_neon_vrshr_n_v:
   case NEON::BI__builtin_neon_vrshrq_n_v:
@@ -8459,18 +8488,20 @@
   case NEON::BI__builtin_neon_vdivh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(1)));
     return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
-  case NEON::BI__builtin_neon_vfmah_f16: {
-    Function *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+  case NEON::BI__builtin_neon_vfmah_f16:
     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
-    return Builder.CreateCall(F,
-      {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
-  }
+    return emitCallMaybeConstrainedFPBuiltin(
+        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
+        {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
   case NEON::BI__builtin_neon_vfmsh_f16: {
-    Function *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+    // FIXME: This should be an fneg instruction:
     Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy);
     Value* Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh");
+
     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
-    return Builder.CreateCall(F, {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
+    return emitCallMaybeConstrainedFPBuiltin(
+        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
+        {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
   }
   case NEON::BI__builtin_neon_vaddd_s64:
   case NEON::BI__builtin_neon_vaddd_u64:
@@ -8837,7 +8868,8 @@
     Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
 
     Ops.pop_back();
-    Int = Intrinsic::fma;
+    Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
+                                       : Intrinsic::fma;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
   }
   case NEON::BI__builtin_neon_vfma_laneq_v: {
@@ -8850,11 +8882,12 @@
         NeonTypeFlags(NeonTypeFlags::Float64, false, true));
       Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
       Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
-      Function *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy);
-      Value *Result = Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
+      Value *Result;
+      Result = emitCallMaybeConstrainedFPBuiltin(
+          *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma,
+          DoubleTy, {Ops[1], Ops[2], Ops[0]});
       return Builder.CreateBitCast(Result, Ty);
     }
-    Function *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
 
@@ -8865,16 +8898,19 @@
                                                cast<ConstantInt>(Ops[3]));
     Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
 
-    return Builder.CreateCall(F, {Ops[2], Ops[1], Ops[0]});
+    return emitCallMaybeConstrainedFPBuiltin(
+        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
+        {Ops[2], Ops[1], Ops[0]});
   }
   case NEON::BI__builtin_neon_vfmaq_laneq_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
 
     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
-    return Builder.CreateCall(F, {Ops[2], Ops[1], Ops[0]});
+    return emitCallMaybeConstrainedFPBuiltin(
+        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
+        {Ops[2], Ops[1], Ops[0]});
   }
   case NEON::BI__builtin_neon_vfmah_lane_f16:
   case NEON::BI__builtin_neon_vfmas_lane_f32:
@@ -8884,9 +8920,10 @@
   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
     Ops.push_back(EmitScalarExpr(E->getArg(3)));
     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
-    Function *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
     Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
-    return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
+    return emitCallMaybeConstrainedFPBuiltin(
+        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
+        {Ops[1], Ops[2], Ops[0]});
   }
   case NEON::BI__builtin_neon_vmull_v:
     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
@@ -8995,27 +9032,37 @@
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
   case NEON::BI__builtin_neon_vrndah_f16: {
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Int = Intrinsic::round;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_round
+              : Intrinsic::round;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
   }
   case NEON::BI__builtin_neon_vrnda_v:
   case NEON::BI__builtin_neon_vrndaq_v: {
-    Int = Intrinsic::round;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_round
+              : Intrinsic::round;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
   }
   case NEON::BI__builtin_neon_vrndih_f16: {
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Int = Intrinsic::nearbyint;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_nearbyint
+              : Intrinsic::nearbyint;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
   }
   case NEON::BI__builtin_neon_vrndmh_f16: {
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Int = Intrinsic::floor;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_floor
+              : Intrinsic::floor;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
   }
   case NEON::BI__builtin_neon_vrndm_v:
   case NEON::BI__builtin_neon_vrndmq_v: {
-    Int = Intrinsic::floor;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_floor
+              : Intrinsic::floor;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
   }
   case NEON::BI__builtin_neon_vrndnh_f16: {
@@ -9035,32 +9082,44 @@
   }
   case NEON::BI__builtin_neon_vrndph_f16: {
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Int = Intrinsic::ceil;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_ceil
+              : Intrinsic::ceil;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
   }
   case NEON::BI__builtin_neon_vrndp_v:
   case NEON::BI__builtin_neon_vrndpq_v: {
-    Int = Intrinsic::ceil;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_ceil
+              : Intrinsic::ceil;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
   }
   case NEON::BI__builtin_neon_vrndxh_f16: {
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Int = Intrinsic::rint;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_rint
+              : Intrinsic::rint;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
   }
   case NEON::BI__builtin_neon_vrndx_v:
   case NEON::BI__builtin_neon_vrndxq_v: {
-    Int = Intrinsic::rint;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_rint
+              : Intrinsic::rint;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
   }
   case NEON::BI__builtin_neon_vrndh_f16: {
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Int = Intrinsic::trunc;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_trunc
+              : Intrinsic::trunc;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
   }
   case NEON::BI__builtin_neon_vrnd_v:
   case NEON::BI__builtin_neon_vrndq_v: {
-    Int = Intrinsic::trunc;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_trunc
+              : Intrinsic::trunc;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
   }
   case NEON::BI__builtin_neon_vcvt_f64_v:
@@ -9211,12 +9270,16 @@
   }
   case NEON::BI__builtin_neon_vsqrth_f16: {
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Int = Intrinsic::sqrt;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_sqrt
+              : Intrinsic::sqrt;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
   }
   case NEON::BI__builtin_neon_vsqrt_v:
   case NEON::BI__builtin_neon_vsqrtq_v: {
-    Int = Intrinsic::sqrt;
+    Int = Builder.getIsFPConstrained()
+              ? Intrinsic::experimental_constrained_sqrt
+              : Intrinsic::sqrt;
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
   }