AMDGPU: Start selecting v_mad_mixhi_f16
llvm-svn: 313814
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1a30a16..9fc38ae 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -775,6 +775,7 @@
return true;
}
+// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
const MachineOperand *ClampSrc = isClamp(MI);
if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 99f7bad..6a751d7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -502,6 +502,7 @@
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -5853,7 +5854,7 @@
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
- SelectionDAG &DAG= DCI.DAG;
+ SelectionDAG &DAG = DCI.DAG;
if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
SDLoc SL(N);
EVT EltVT = N->getValueType(0);
@@ -5866,6 +5867,47 @@
return SDValue();
}
+/// Try to turn a (constant-or-undef, bitcast-from-f16) pair of build_vector
+/// elements into a pair of f16 values.
+///
+/// Succeeds only when \p Hi is an ISD::BITCAST whose source has type f16 and
+/// \p Lo is a ConstantSDNode or undef. On success \p Lo is replaced by a
+/// bitcast of itself to f16, \p Hi is replaced by the bitcast's f16 source,
+/// and true is returned. Otherwise both values are left untouched and false
+/// is returned.
+static bool convertBuildVectorCastElt(SelectionDAG &DAG,
+                                      SDValue &Lo, SDValue &Hi) {
+  // The second value has to be a bitcast ...
+  if (Hi.getOpcode() != ISD::BITCAST)
+    return false;
+
+  // ... whose source is an f16 value.
+  if (Hi.getOperand(0).getValueType() != MVT::f16)
+    return false;
+
+  // The first value has to be a constant or undef.
+  if (!isa<ConstantSDNode>(Lo) && !Lo.isUndef())
+    return false;
+
+  Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
+  Hi = Hi.getOperand(0);
+  return true;
+}
+
+/// Target DAG combine for ISD::BUILD_VECTOR.
+///
+/// When v2i16 is a legal type and \p N produces v2i16, a vector in which one
+/// element is a bitcast from f16 and the other is a constant or undef is
+/// rebuilt as a v2f16 build_vector and bitcast back to v2i16.
+///
+/// \param N   The BUILD_VECTOR node being combined.
+/// \param DCI Combiner state; only its SelectionDAG is used here.
+/// \returns the replacement value, or an empty SDValue if nothing was done.
+SDValue SITargetLowering::performBuildVectorCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDLoc SL(N);
+
+  if (!isTypeLegal(MVT::v2i16))
+    return SDValue();
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  if (VT == MVT::v2i16) {
+    SDValue Lo = N->getOperand(0);
+    SDValue Hi = N->getOperand(1);
+
+    // v2i16 build_vector (const|undef), (bitcast f16:$x)
+    //   -> bitcast (v2f16 build_vector const|undef, $x)
+    // and the mirrored form with the bitcast in element 0.
+    //
+    // convertBuildVectorCastElt rewrites both values in place, so whichever
+    // argument order matched, Lo is still element 0 and Hi is still
+    // element 1. The new vector must therefore always be built as
+    // { Lo, Hi }; building it as { Hi, Lo } would swap the two lanes.
+    if (convertBuildVectorCastElt(DAG, Lo, Hi) ||
+        convertBuildVectorCastElt(DAG, Hi, Lo)) {
+      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
+      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
+    }
+  }
+
+  return SDValue();
+}
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
@@ -6287,6 +6329,8 @@
}
case ISD::EXTRACT_VECTOR_ELT:
return performExtractVectorEltCombine(N, DCI);
+ case ISD::BUILD_VECTOR:
+ return performBuildVectorCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index ad38eb6..91380f8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -111,6 +111,7 @@
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index b7aa2a9..313792f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -76,8 +76,11 @@
// Clamp modifier is applied after conversion to f16.
def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
}
+}
let Predicates = [HasMadMix] in {
@@ -88,10 +91,56 @@
(V_MAD_MIXLO_F16 $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
- 0,
+ DSTCLAMP.NONE,
(i32 (IMPLICIT_DEF)))
>;
+// FIXME: Special case handling for mixhi (especially for clamp)
+// because dealing with the write to high half of the register is
+// difficult.
+// Select a build_vector whose second element is an fpround of an f32 fmad
+// (all three sources matched through VOP3PMadMixMods) to V_MAD_MIXHI_F16
+// with clamp disabled. $elt0, the first build_vector element, is passed as
+// the instruction's final operand.
+def : Pat <
+ (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ $elt0))
+>;
+
+// Same shape as the unclamped build_vector pattern, except that the second
+// element is wrapped in AMDGPUclamp. The clamp is folded into the
+// instruction by selecting V_MAD_MIXHI_F16 with DSTCLAMP.ENABLE; $elt0 is
+// again passed as the final operand.
+def : Pat <
+ (build_vector
+ f16:$elt0,
+ (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+ (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.ENABLE,
+ $elt0))
+>;
+
+// AMDGPUclamp applied to a whole build_vector in which both elements are an
+// fpround of an f32 fmad. The first element is selected to V_MAD_MIXLO_F16
+// (fed an IMPLICIT_DEF as its final operand) and the second to
+// V_MAD_MIXHI_F16, which takes the MIXLO result as its final operand. Both
+// instructions are emitted with DSTCLAMP.ENABLE.
+def : Pat <
+ (AMDGPUclamp (build_vector
+ (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+ (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+ (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
+ $hi_src1_modifiers, $hi_src1,
+ $hi_src2_modifiers, $hi_src2,
+ DSTCLAMP.ENABLE,
+ (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
+ $lo_src1_modifiers, $lo_src1,
+ $lo_src2_modifiers, $lo_src2,
+ DSTCLAMP.ENABLE,
+ (i32 (IMPLICIT_DEF)))))
+>;
+
} // End Predicates = [HasMadMix]
multiclass VOP3P_Real_vi<bits<10> op> {