AMDGPU: Use source modifiers with f16->f32 conversions
The operand types were defined to fit the fp16_to_fp node, which
has the half as an integer type. v_cvt_f32_f16 does support
source modifiers, so change this to have an FP type and modifiers.
For targets without legal f16, this requires recognizing the
bit operations and trying to produce them.
llvm-svn: 293857
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 70eb3e1..3350654 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -479,6 +479,7 @@
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
+ setTargetDAGCombine(ISD::FABS);
}
//===----------------------------------------------------------------------===//
@@ -2968,6 +2969,45 @@
SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
}
+ case ISD::FP16_TO_FP: {
+ // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
+ // f16, but legalization of f16 fneg ends up pulling it out of the source.
+ // Put the fneg back as a legal source operation that can be matched later.
+ SDLoc SL(N);
+
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
+ SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
+ DAG.getConstant(0x8000, SL, SrcVT));
+ return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
+ }
+ default:
+ return SDValue();
+ }
+}
+
+SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ switch (N0.getOpcode()) {
+ case ISD::FP16_TO_FP: {
+ assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
+ SDLoc SL(N);
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
+ SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
+ DAG.getConstant(0x7fff, SL, SrcVT));
+ return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
+ }
default:
return SDValue();
}
@@ -3080,6 +3120,8 @@
return performSelectCombine(N, DCI);
case ISD::FNEG:
return performFNegCombine(N, DCI);
+ case ISD::FABS:
+ return performFAbsCombine(N, DCI);
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
assert(!N->getValueType(0).isVector() &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 880d571..58ac09f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -85,6 +85,7 @@
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 004b10f..7d2a52b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -637,6 +637,9 @@
def umin_oneuse : HasOneUseBinOp<umin>;
def fminnum_oneuse : HasOneUseBinOp<fminnum>;
def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+def and_oneuse : HasOneUseBinOp<and>;
+def or_oneuse : HasOneUseBinOp<or>;
+def xor_oneuse : HasOneUseBinOp<xor>;
} // Properties = [SDNPCommutative, SDNPAssociative]
def sub_oneuse : HasOneUseBinOp<sub>;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 0c5bb06..bdaa5ae 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -226,6 +226,8 @@
case 8:
return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
case 2:
+ // FIXME Is this correct? What do inline immediates do on SI for f16 src
+ // which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
default:
llvm_unreachable("invalid operand size");
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e718b45..5100658 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1824,7 +1824,8 @@
return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
ST.hasInv2PiInlineImm());
case 16:
- return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
ST.hasInv2PiInlineImm());
default:
llvm_unreachable("invalid bitwidth");
@@ -1854,8 +1855,13 @@
}
case 16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+ // A few special case instructions have 16-bit operands on subtargets
+ // where 16-bit instructions are not legal.
+ // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
+ // constants in these cases
int16_t Trunc = static_cast<int16_t>(Imm);
- return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
}
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index cf9b9c5..a691ef1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -619,6 +619,8 @@
def SRCMODS {
int NONE = 0;
int NEG = 1;
+ int ABS = 2;
+ int NEG_ABS = 3;
}
def DSTCLAMP {
@@ -1132,6 +1134,8 @@
def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
+def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5cdd139..ed0609d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -430,9 +430,26 @@
} // End Predicates = [UnsafeFPMath]
+
+// f16_to_fp patterns
def : Pat <
- (f32 (fpextend f16:$src)),
- (V_CVT_F32_F16_e32 $src)
+ (f32 (f16_to_fp i32:$src0)),
+ (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+ (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+ (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
+ (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+ (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
+ (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
def : Pat <
@@ -440,9 +457,10 @@
(V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
+// fp_to_fp16 patterns
def : Pat <
- (f16 (fpround f32:$src)),
- (V_CVT_F16_F32_e32 $src)
+ (i32 (fp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))),
+ (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod)
>;
def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index a15b9ce..15abed4 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -147,8 +147,8 @@
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
-defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
-defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
+defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>;