[FPEnv][AArch64] Platform-specific builtin constrained FP enablement
When constrained floating point is enabled the AArch64-specific builtins don't use constrained intrinsics in some cases. Fix that.
Neon is part of this patch, so ARM is affected as well.
Differential Revision: https://reviews.llvm.org/D77074
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 880fe0e..f985e58 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -411,6 +411,25 @@
}
}
+// Emit an intrinsic where all operands are of the same type as the result.
+// Depending on mode, this may be a constrained floating-point intrinsic.
+static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
+ unsigned IntrinsicID,
+ unsigned ConstrainedIntrinsicID,
+ llvm::Type *Ty,
+ ArrayRef<Value *> Args) {
+ Function *F;
+ if (CGF.Builder.getIsFPConstrained())
+ F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
+ else
+ F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);
+
+ if (CGF.Builder.getIsFPConstrained())
+ return CGF.Builder.CreateConstrainedFPCall(F, Args);
+ else
+ return CGF.Builder.CreateCall(F, Args);
+}
+
// Emit a simple mangled intrinsic that has 1 argument and a return type
// matching the argument type.
static Value *emitUnaryBuiltin(CodeGenFunction &CGF,
@@ -4511,13 +4530,20 @@
unsigned shift, bool rightshift) {
unsigned j = 0;
for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
- ai != ae; ++ai, ++j)
+ ai != ae; ++ai, ++j) {
+ if (F->isConstrainedFPIntrinsic())
+ if (ai->getType()->isMetadataTy())
+ continue;
if (shift > 0 && shift == j)
Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
else
Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
+ }
- return Builder.CreateCall(F, Ops, name);
+ if (F->isConstrainedFPIntrinsic())
+ return Builder.CreateConstrainedFPCall(F, Ops, name);
+ else
+ return Builder.CreateCall(F, Ops, name);
}
Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
@@ -5695,13 +5721,14 @@
}
case NEON::BI__builtin_neon_vfma_v:
case NEON::BI__builtin_neon_vfmaq_v: {
- Function *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
// NEON intrinsic puts accumulator first, unlike the LLVM fma.
- return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
+ return emitCallMaybeConstrainedFPBuiltin(
+ *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
+ {Ops[1], Ops[2], Ops[0]});
}
case NEON::BI__builtin_neon_vld1_v:
case NEON::BI__builtin_neon_vld1q_v: {
@@ -5859,7 +5886,9 @@
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
case NEON::BI__builtin_neon_vrndi_v:
case NEON::BI__builtin_neon_vrndiq_v:
- Int = Intrinsic::nearbyint;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_nearbyint
+ : Intrinsic::nearbyint;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
case NEON::BI__builtin_neon_vrshr_n_v:
case NEON::BI__builtin_neon_vrshrq_n_v:
@@ -8459,18 +8488,20 @@
case NEON::BI__builtin_neon_vdivh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(1)));
return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
- case NEON::BI__builtin_neon_vfmah_f16: {
- Function *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+ case NEON::BI__builtin_neon_vfmah_f16:
// NEON intrinsic puts accumulator first, unlike the LLVM fma.
- return Builder.CreateCall(F,
- {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
- }
+ return emitCallMaybeConstrainedFPBuiltin(
+ *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
+ {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
case NEON::BI__builtin_neon_vfmsh_f16: {
- Function *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+ // FIXME: This should be an fneg instruction:
Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy);
Value* Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh");
+
// NEON intrinsic puts accumulator first, unlike the LLVM fma.
- return Builder.CreateCall(F, {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
+ return emitCallMaybeConstrainedFPBuiltin(
+ *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
+ {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
}
case NEON::BI__builtin_neon_vaddd_s64:
case NEON::BI__builtin_neon_vaddd_u64:
@@ -8837,7 +8868,8 @@
Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
Ops.pop_back();
- Int = Intrinsic::fma;
+ Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
+ : Intrinsic::fma;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
}
case NEON::BI__builtin_neon_vfma_laneq_v: {
@@ -8850,11 +8882,12 @@
NeonTypeFlags(NeonTypeFlags::Float64, false, true));
Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
- Function *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy);
- Value *Result = Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
+ Value *Result;
+ Result = emitCallMaybeConstrainedFPBuiltin(
+ *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma,
+ DoubleTy, {Ops[1], Ops[2], Ops[0]});
return Builder.CreateBitCast(Result, Ty);
}
- Function *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
@@ -8865,16 +8898,19 @@
cast<ConstantInt>(Ops[3]));
Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
- return Builder.CreateCall(F, {Ops[2], Ops[1], Ops[0]});
+ return emitCallMaybeConstrainedFPBuiltin(
+ *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
+ {Ops[2], Ops[1], Ops[0]});
}
case NEON::BI__builtin_neon_vfmaq_laneq_v: {
- Function *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
- return Builder.CreateCall(F, {Ops[2], Ops[1], Ops[0]});
+ return emitCallMaybeConstrainedFPBuiltin(
+ *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
+ {Ops[2], Ops[1], Ops[0]});
}
case NEON::BI__builtin_neon_vfmah_lane_f16:
case NEON::BI__builtin_neon_vfmas_lane_f32:
@@ -8884,9 +8920,10 @@
case NEON::BI__builtin_neon_vfmad_laneq_f64: {
Ops.push_back(EmitScalarExpr(E->getArg(3)));
llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
- Function *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
- return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
+ return emitCallMaybeConstrainedFPBuiltin(
+ *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
+ {Ops[1], Ops[2], Ops[0]});
}
case NEON::BI__builtin_neon_vmull_v:
// FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
@@ -8995,27 +9032,37 @@
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
case NEON::BI__builtin_neon_vrndah_f16: {
Ops.push_back(EmitScalarExpr(E->getArg(0)));
- Int = Intrinsic::round;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_round
+ : Intrinsic::round;
return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
}
case NEON::BI__builtin_neon_vrnda_v:
case NEON::BI__builtin_neon_vrndaq_v: {
- Int = Intrinsic::round;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_round
+ : Intrinsic::round;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
}
case NEON::BI__builtin_neon_vrndih_f16: {
Ops.push_back(EmitScalarExpr(E->getArg(0)));
- Int = Intrinsic::nearbyint;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_nearbyint
+ : Intrinsic::nearbyint;
return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
}
case NEON::BI__builtin_neon_vrndmh_f16: {
Ops.push_back(EmitScalarExpr(E->getArg(0)));
- Int = Intrinsic::floor;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_floor
+ : Intrinsic::floor;
return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
}
case NEON::BI__builtin_neon_vrndm_v:
case NEON::BI__builtin_neon_vrndmq_v: {
- Int = Intrinsic::floor;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_floor
+ : Intrinsic::floor;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
}
case NEON::BI__builtin_neon_vrndnh_f16: {
@@ -9035,32 +9082,44 @@
}
case NEON::BI__builtin_neon_vrndph_f16: {
Ops.push_back(EmitScalarExpr(E->getArg(0)));
- Int = Intrinsic::ceil;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_ceil
+ : Intrinsic::ceil;
return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
}
case NEON::BI__builtin_neon_vrndp_v:
case NEON::BI__builtin_neon_vrndpq_v: {
- Int = Intrinsic::ceil;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_ceil
+ : Intrinsic::ceil;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
}
case NEON::BI__builtin_neon_vrndxh_f16: {
Ops.push_back(EmitScalarExpr(E->getArg(0)));
- Int = Intrinsic::rint;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_rint
+ : Intrinsic::rint;
return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
}
case NEON::BI__builtin_neon_vrndx_v:
case NEON::BI__builtin_neon_vrndxq_v: {
- Int = Intrinsic::rint;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_rint
+ : Intrinsic::rint;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
}
case NEON::BI__builtin_neon_vrndh_f16: {
Ops.push_back(EmitScalarExpr(E->getArg(0)));
- Int = Intrinsic::trunc;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_trunc
+ : Intrinsic::trunc;
return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
}
case NEON::BI__builtin_neon_vrnd_v:
case NEON::BI__builtin_neon_vrndq_v: {
- Int = Intrinsic::trunc;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_trunc
+ : Intrinsic::trunc;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
}
case NEON::BI__builtin_neon_vcvt_f64_v:
@@ -9211,12 +9270,16 @@
}
case NEON::BI__builtin_neon_vsqrth_f16: {
Ops.push_back(EmitScalarExpr(E->getArg(0)));
- Int = Intrinsic::sqrt;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_sqrt
+ : Intrinsic::sqrt;
return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
}
case NEON::BI__builtin_neon_vsqrt_v:
case NEON::BI__builtin_neon_vsqrtq_v: {
- Int = Intrinsic::sqrt;
+ Int = Builder.getIsFPConstrained()
+ ? Intrinsic::experimental_constrained_sqrt
+ : Intrinsic::sqrt;
Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
}