AMDGPU: Redefine clamp node as clamp 0.0-1.0
Change implementation to use max instead of add.
min/max/med3 do not flush denormals regardless of the mode,
so it is OK to use them whether or not denormals are enabled.
Also allow using clamp with f16, and take the dx10_clamp mode
into account (when it is enabled, NaNs clamp to 0.0).
llvm-svn: 295788
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 60acf4f..40a2820 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3995,8 +3995,10 @@
return DAG.isKnownNeverNaN(Op);
}
-static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1) {
+SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
+ const SDLoc &SL,
+ SDValue Op0,
+ SDValue Op1) const {
ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
if (!K1)
return SDValue();
@@ -4010,6 +4012,20 @@
if (Cmp == APFloat::cmpGreaterThan)
return SDValue();
+ // TODO: Check IEEE bit enabled?
+ EVT VT = K0->getValueType(0);
+ if (Subtarget->enableDX10Clamp()) {
+ // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
+ // hardware fmed3 behavior converting to a min.
+ // FIXME: Should this be allowing -0.0?
+ if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
+ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
+ }
+
+ // No med3 for f16, but clamp is possible.
+ if (VT == MVT::f16)
+ return SDValue();
+
// This isn't safe with signaling NaNs because in IEEE mode, min/max on a
// signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
// give the other result, which is different from med3 with a NaN input.
@@ -4074,7 +4090,9 @@
if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
- N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+ (N->getValueType(0) == MVT::f32 ||
+ (N->getValueType(0) == MVT::f16 && Subtarget->has16BitInsts())) &&
+ Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
return Res;
}
@@ -4082,6 +4100,60 @@
return SDValue();
}
+static bool isClampZeroToOne(SDValue A, SDValue B) { // True iff A and B are the FP constants 0.0 and 1.0, in either order.
+ if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
+ if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
+ // Both operands are FP constants; accept {0.0, 1.0} in either order, compared with isExactlyValue. FIXME: Should this be allowing -0.0?
+ return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
+ (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
+ }
+ }
+
+ return false; // At least one operand is not a ConstantFPSDNode.
+}
+
+// Try to fold an FMED3 node whose operands include the constants 0.0 and 1.0 into AMDGPUISD::CLAMP of the remaining operand; returns an empty SDValue if no fold applies. FIXME: Should only worry about snans for version with chain.
+SDValue SITargetLowering::performFMed3Combine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+ // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
+ // NaNs. With a NaN input, the order of the operands may change the result.
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ SDValue Src2 = N->getOperand(2);
+
+ if (isClampZeroToOne(Src0, Src1)) {
+ // med3(const_a, const_b, x) -> clamp(x) is safe in all cases, including
+ // signaling NaNs.
+ // FIXME: Should this be allowing -0.0?
+ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
+ }
+
+ // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
+ // handling no dx10-clamp?
+ if (Subtarget->enableDX10Clamp()) {
+ // If NaNs are clamped to 0, we are free to reorder the inputs. The three
+ // conditional swaps below move constant operands after non-constant ones.
+ if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+ std::swap(Src0, Src1);
+
+ if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
+ std::swap(Src1, Src2);
+
+ if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+ std::swap(Src0, Src1);
+
+ if (isClampZeroToOne(Src1, Src2)) // After reordering, the 0.0/1.0 pair (if any) sits in Src1/Src2.
+ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
+ }
+
+ return SDValue(); // No clamp pattern matched; empty SDValue is returned.
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -4348,6 +4420,8 @@
case AMDGPUISD::CVT_F32_UBYTE2:
case AMDGPUISD::CVT_F32_UBYTE3:
return performCvtF32UByteNCombine(N, DCI);
+ case AMDGPUISD::FMED3:
+ return performFMed3Combine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}