AMDGPU: Start selecting v_mad_mixhi_f16

llvm-svn: 313814
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1a30a16..9fc38ae 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -775,6 +775,7 @@
   return true;
 }
 
+// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
 bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
   const MachineOperand *ClampSrc = isClamp(MI);
   if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 99f7bad..6a751d7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -502,6 +502,7 @@
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::BUILD_VECTOR);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -5853,7 +5854,7 @@
   SDNode *N, DAGCombinerInfo &DCI) const {
   SDValue Vec = N->getOperand(0);
 
-  SelectionDAG &DAG= DCI.DAG;
+  SelectionDAG &DAG = DCI.DAG;
   if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
     SDLoc SL(N);
     EVT EltVT = N->getValueType(0);
@@ -5866,6 +5867,47 @@
   return SDValue();
 }
 
+// If Hi is a bitcast from f16 and Lo is a constant or undef, rewrite Lo as a
+// bitcast to f16 and Hi as its f16 source and return true; else change nothing.
+static bool convertBuildVectorCastElt(SelectionDAG &DAG,
+                                      SDValue &Lo, SDValue &Hi) {
+  if (Hi.getOpcode() != ISD::BITCAST ||
+      Hi.getOperand(0).getValueType() != MVT::f16 ||
+      !(isa<ConstantSDNode>(Lo) || Lo.isUndef()))
+    return false;
+  Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
+  Hi = Hi.getOperand(0);
+  return true;
+}
+
+// Combine a v2i16 BUILD_VECTOR in which one element is a bitcast from f16 and
+// the other is a constant or undef into a v2f16 BUILD_VECTOR whose result is
+// bitcast back to v2i16. Returns an empty SDValue if nothing was changed.
+SDValue SITargetLowering::performBuildVectorCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDLoc SL(N);
+
+  if (!isTypeLegal(MVT::v2i16))
+    return SDValue();
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  if (VT == MVT::v2i16) {
+    SDValue Lo = N->getOperand(0);
+    SDValue Hi = N->getOperand(1);
+
+    // v2i16 build_vector (const|undef), (bitcast f16:$x)
+    // -> bitcast (v2f16 build_vector const|undef, $x), or the mirrored form.
+    // Either way the helper rewrites Lo/Hi in place, so element order is kept.
+    if (convertBuildVectorCastElt(DAG, Lo, Hi) ||
+        convertBuildVectorCastElt(DAG, Hi, Lo)) {
+      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
+      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
+    }
+  }
+
+  return SDValue();
+}
 
 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                           const SDNode *N0,
@@ -6287,6 +6329,8 @@
   }
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI);
+  case ISD::BUILD_VECTOR:
+    return performBuildVectorCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index ad38eb6..91380f8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -111,6 +111,7 @@
   SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index b7aa2a9..313792f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -76,8 +76,11 @@
 
 // Clamp modifier is applied after conversion to f16.
 def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
 def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 }
+}
 
 let Predicates = [HasMadMix] in {
 
@@ -88,10 +91,56 @@
   (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
                    $src1_modifiers, $src1,
                    $src2_modifiers, $src2,
-                   0,
+                   DSTCLAMP.NONE,
                    (i32 (IMPLICIT_DEF)))
 >;
 
+// FIXME: Special case handling for mixhi (especially for clamp) because
+// dealing with the write to the high half of the register is difficult.
+// Unclamped form: element 1 is fpround(fmad); $elt0 is the last operand.
+def : Pat <
+  (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                          (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                          (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+                          $src1_modifiers, $src1,
+                          $src2_modifiers, $src2,
+                          DSTCLAMP.NONE,
+                          $elt0))
+>;
+
+def : Pat < // build_vector $elt0, clamp(fpround(fmad)): mixhi with clamp on
+  (build_vector
+    f16:$elt0,
+    (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+                          $src1_modifiers, $src1,
+                          $src2_modifiers, $src2,
+                          DSTCLAMP.ENABLE,
+                          $elt0))
+>;
+
+def : Pat < // clamp of both elements: clamped mixlo feeds clamped mixhi
+  (AMDGPUclamp (build_vector
+    (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+    (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+  (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
+                          $hi_src1_modifiers, $hi_src1,
+                          $hi_src2_modifiers, $hi_src2,
+                          DSTCLAMP.ENABLE,
+                          (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
+                                           $lo_src1_modifiers, $lo_src1,
+                                           $lo_src2_modifiers, $lo_src2,
+                                           DSTCLAMP.ENABLE,
+                                           (i32 (IMPLICIT_DEF)))))
+>;
+
 } // End Predicates = [HasMadMix]
 
 multiclass VOP3P_Real_vi<bits<10> op> {