AMDGPU: Use source modifiers with f16->f32 conversions

The operand types were defined to fit the fp16_to_fp node, which
has the half as an integer type. v_cvt_f32_f16 does support
source modifiers, so change this to have an FP type and modifiers.

For targets without legal f16, this requires recognizing the
bit operations and trying to produce them.

llvm-svn: 293857
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 70eb3e1..3350654 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -479,6 +479,7 @@
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FNEG);
+  setTargetDAGCombine(ISD::FABS);
 }
 
 //===----------------------------------------------------------------------===//
@@ -2968,6 +2969,45 @@
     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
   }
+  case ISD::FP16_TO_FP: {
+    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
+    // f16, but legalization of f16 fneg ends up pulling it out of the source.
+    // Put the fneg back as a legal source operation that can be matched later.
+    SDLoc SL(N);
+
+    SDValue Src = N0.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+
+    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
+    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
+                                  DAG.getConstant(0x8000, SL, SrcVT));
+    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
+  }
+  default:
+    return SDValue();
+  }
+}
+
+SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+
+  if (!N0.hasOneUse())
+    return SDValue();
+
+  switch (N0.getOpcode()) {
+  case ISD::FP16_TO_FP: {
+    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
+    SDLoc SL(N);
+    SDValue Src = N0.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+
+    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
+    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
+                                  DAG.getConstant(0x7fff, SL, SrcVT));
+    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
+  }
   default:
     return SDValue();
   }
@@ -3080,6 +3120,8 @@
     return performSelectCombine(N, DCI);
   case ISD::FNEG:
     return performFNegCombine(N, DCI);
+  case ISD::FABS:
+    return performFAbsCombine(N, DCI);
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     assert(!N->getValueType(0).isVector() &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 880d571..58ac09f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -85,6 +85,7 @@
                              SDValue RHS, DAGCombinerInfo &DCI) const;
   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 004b10f..7d2a52b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -637,6 +637,9 @@
 def umin_oneuse : HasOneUseBinOp<umin>;
 def fminnum_oneuse : HasOneUseBinOp<fminnum>;
 def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+def and_oneuse : HasOneUseBinOp<and>;
+def or_oneuse : HasOneUseBinOp<or>;
+def xor_oneuse : HasOneUseBinOp<xor>;
 } // Properties = [SDNPCommutative, SDNPAssociative]
 
 def sub_oneuse : HasOneUseBinOp<sub>;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 0c5bb06..bdaa5ae 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -226,6 +226,8 @@
   case 8:
     return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
   case 2:
+    // FIXME Is this correct? What do inline immediates do on SI for f16 src
+    // which does not have f16 support?
     return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
   default:
     llvm_unreachable("invalid operand size");
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e718b45..5100658 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1824,7 +1824,8 @@
     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
   case 16:
-    return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+    return ST.has16BitInsts() &&
+           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
   default:
     llvm_unreachable("invalid bitwidth");
@@ -1854,8 +1855,13 @@
   }
   case 16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+      // A few special case instructions have 16-bit operands on subtargets
+      // where 16-bit instructions are not legal.
+      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
+      // constants in these cases
       int16_t Trunc = static_cast<int16_t>(Imm);
-      return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+      return ST.has16BitInsts() &&
+             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
     }
 
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index cf9b9c5..a691ef1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -619,6 +619,8 @@
 def SRCMODS {
   int NONE = 0;
   int NEG = 1;
+  int ABS = 2;
+  int NEG_ABS = 3;
 }
 
 def DSTCLAMP {
@@ -1132,6 +1134,8 @@
 def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
 def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
 def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
+def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
 
 def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
 def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5cdd139..ed0609d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -430,9 +430,26 @@
 
 } // End Predicates = [UnsafeFPMath]
 
+
+// f16_to_fp patterns
 def : Pat <
-  (f32 (fpextend f16:$src)),
-  (V_CVT_F32_F16_e32 $src)
+  (f32 (f16_to_fp i32:$src0)),
+  (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+  (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
+  (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+  (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
+  (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+  (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
+  (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
 def : Pat <
@@ -440,9 +457,10 @@
   (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
 >;
 
+// fp_to_fp16 patterns
 def : Pat <
-  (f16 (fpround f32:$src)),
-  (V_CVT_F16_F32_e32 $src)
+  (i32 (fp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))),
+  (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod)
 >;
 
 def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index a15b9ce..15abed4 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -147,8 +147,8 @@
 defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
 defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
 defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
-defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
-defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
+defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
 defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
 defm V_CVT_OFF_F32_I4 : VOP1Inst  <"v_cvt_off_f32_i4", VOP_F32_I32>;
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index c64aa62..77c9413 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -19,8 +19,7 @@
 
 ; GCN-LABEL: {{^}}fabs_f16:
 ; CI: flat_load_ushort [[VAL:v[0-9]+]],
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]]
-; CI: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], |[[CVT0]]|
+; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define void @fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
@@ -30,10 +29,10 @@
 
 ; FIXME: Should be able to use single and
 ; GCN-LABEL: {{^}}fabs_v2f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+
+; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 
 ; VI: flat_load_ushort [[LO:v[0-9]+]]
 ; VI: flat_load_ushort [[HI:v[0-9]+]]
@@ -51,10 +50,11 @@
 }
 
 ; GCN-LABEL: {{^}}fabs_v4f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 
 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
 ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
@@ -72,9 +72,10 @@
 ; GCN-LABEL: {{^}}fabs_fold_f16:
 ; GCN: flat_load_ushort [[IN0:v[0-9]+]]
 ; GCN: flat_load_ushort [[IN1:v[0-9]+]]
+
 ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
-; CI-DAG: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], [[IN1]]
-; CI: v_mul_f32_e64 [[RESULT:v[0-9]+]],  |[[CVT1]]|, [[CVT0]]
+; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]|
+; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]],  [[CVT0]], [[ABS_CVT1]]
 ; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index a62726f..8a01ea2 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -28,10 +28,10 @@
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI:  v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
+; SI:  v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|
 
-; SI:  v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F32]]|, |v[[B_F32]]|
+; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
 ; VI:  v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|
 
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index d7d2131..240fd07 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -3,8 +3,8 @@
 
 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
 ; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_sub_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |v{{[0-9]+}}|
+; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
+; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}}
 
 ; VI-NOT: and
 ; VI: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
@@ -17,14 +17,15 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
-; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
+; CI-DAG: v_cvt_f32_f16_e32
+; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
+; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}}
 ; CI: v_cvt_f16_f32_e32
 
 ; VI-NOT: and
-; VI: v_mul_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
-; VI-NOT: and
+; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
+; VI-NOT: [[MUL]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
 define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
   %fabs = call half @llvm.fabs.f16(half %x)
   %fsub = fsub half -0.000000e+00, %fabs
@@ -49,10 +50,7 @@
 
 ; FIXME: Should use or
 ; GCN-LABEL: {{^}}fneg_fabs_f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
 define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
   %fsub = fsub half -0.000000e+00, %fabs
@@ -61,10 +59,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_fneg_fabs_f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
 define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %val = load half, half addrspace(1)* %in, align 2
   %fabs = call half @llvm.fabs.f16(half %val)
@@ -75,13 +70,10 @@
 
 ; FIXME: single bit op
 ; GCN-LABEL: {{^}}fneg_fabs_v2f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: flat_store_dword
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: store_dword
 define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fsub = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %fabs
@@ -90,17 +82,12 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: flat_store_dwordx2
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: store_dwordx2
 define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
   %fsub = fsub <4 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index e3dfd92..d545cc7 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -15,13 +15,9 @@
 
 ; FUNC-LABEL: {{^}}v_fneg_f16:
 ; GCN: flat_load_ushort [[VAL:v[0-9]+]],
-
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]]
-; CI: v_cvt_f16_f32_e64 [[CVT1:v[0-9]+]], -[[CVT0]]
-; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
-
-; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
+; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
+; SI: buffer_store_short [[XOR]]
 define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %val = load half, half addrspace(1)* %in, align 2
   %fneg = fsub half -0.000000e+00, %val
@@ -45,8 +41,9 @@
 ; FUNC-LABEL: {{^}}v_fneg_fold_f16:
 ; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]]
 
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[CVT0]]
-; CI: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[CVT0]], [[CVT0]]
+; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
+; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]
+; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]]
 ; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index c4f5d7c..433fdf1 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -68,3 +68,202 @@
   store <2 x double> %r.val, <2 x double> addrspace(1)* %r
   ret void
 }
+
+; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32:
+; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
+define void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) {
+entry:
+  %a.trunc = trunc i32 %a to i16
+  %a.val = bitcast i16 %a.trunc to half
+  %r.val = fpext half %a.val to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]]
+define void @fneg_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.neg = fsub half -0.0, %a.val
+  %r.val = fpext half %a.neg to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]|
+define void @fabs_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %r.val = fpext half %a.fabs to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]|
+define void @fneg_fabs_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %a.fneg.fabs = fsub half -0.0, %a.fabs
+  %r.val = fpext half %a.fneg.fabs to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[A]]
+
+; FIXME: Using the source modifier here only wastes code size
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
+
+; GCN: store_dword [[CVT]]
+; GCN: store_short [[XOR]]
+define void @fneg_multi_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.neg = fsub half -0.0, %a.val
+  %r.val = fpext half %a.neg to float
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %a.neg, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]]
+; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
+; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]]
+
+; GCN: buffer_store_dword [[CVTA_NEG]]
+; GCN: buffer_store_short [[MUL]]
+define void @fneg_multi_foldable_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.neg = fsub half -0.0, %a.val
+  %r.val = fpext half %a.neg to float
+  %mul = fmul half %a.neg, %a.val
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %mul, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_and_b32_e32 [[XOR:v[0-9]+]], 0x7fff, [[A]]
+
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |[[A]]|
+
+; GCN: store_dword [[CVT]]
+; GCN: store_short [[XOR]]
+define void @fabs_multi_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %r.val = fpext half %a.fabs to float
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %a.fabs, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], |[[CVTA]]|, [[CVTA]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]]
+
+; VI-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]|
+; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]]
+
+; GCN: buffer_store_dword [[ABS_A]]
+; GCN: buffer_store_short [[MUL]]
+define void @fabs_multi_foldable_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %r.val = fpext half %a.fabs to float
+  %mul = fmul half %a.fabs, %a.val
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %mul, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fneg_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x8000, [[A]]
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[OR]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[OR]]|
+
+; GCN: buffer_store_dword [[CVT]]
+; GCN: buffer_store_short [[OR]]
+define void @fabs_fneg_multi_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %a.fneg.fabs = fsub half -0.0, %a.fabs
+  %r.val = fpext half %a.fneg.fabs to float
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %a.fneg.fabs, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], -|[[CVTA]]|, [[CVTA]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]]
+
+; VI-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]|
+; VI-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]]
+
+; GCN: buffer_store_dword [[FABS_FNEG]]
+; GCN: buffer_store_short [[MUL]]
+define void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %a.fneg.fabs = fsub half -0.0, %a.fabs
+  %r.val = fpext half %a.fneg.fabs to float
+  %mul = fmul half %a.fneg.fabs, %a.val
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %mul, half addrspace(1)* undef
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 284fc53..c9905d5 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
-; GCN-LABEL: {{^}}fptrunc_f32_to_f16
+; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
@@ -16,7 +16,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}fptrunc_f64_to_f16
+; GCN-LABEL: {{^}}fptrunc_f64_to_f16:
 ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}}
 ; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}}
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
@@ -32,7 +32,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16
+; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16:
 ; GCN:     buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}}
 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
@@ -51,7 +51,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16
+; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16:
 ; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}}
 ; GCN: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
 ; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
@@ -70,3 +70,56 @@
   store <2 x half> %r.val, <2 x half> addrspace(1)* %r
   ret void
 }
+
+; GCN-LABEL: {{^}}fneg_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]]
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define void @fneg_fptrunc_f32_to_f16(
+    half addrspace(1)* %r,
+    float addrspace(1)* %a) {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fneg = fsub float -0.0, %a.val
+  %r.val = fptrunc float %a.fneg to half
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define void @fabs_fptrunc_f32_to_f16(
+    half addrspace(1)* %r,
+    float addrspace(1)* %a) {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fabs = call float @llvm.fabs.f32(float %a.val)
+  %r.val = fptrunc float %a.fabs to half
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]|
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define void @fneg_fabs_fptrunc_f32_to_f16(
+    half addrspace(1)* %r,
+    float addrspace(1)* %a) {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fabs = call float @llvm.fabs.f32(float %a.val)
+  %a.fneg.fabs = fsub float -0.0, %a.fabs
+  %r.val = fptrunc float %a.fneg.fabs to half
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+declare float @llvm.fabs.f32(float) #1
+
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
index 20c1d23..413f3f3 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -31,9 +31,10 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_same_add
+; GCN-LABEL: {{^}}mac_f16_same_add:
 ; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; SI:  v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI:  v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; VI:  v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
@@ -63,9 +64,11 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_a
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_a:
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
@@ -87,9 +90,10 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_b
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_b:
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
+; SI:     v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
@@ -111,9 +115,12 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_c
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_c:
+; SI: v_cvt_f32_f16_e32
+; SI-DAG: v_cvt_f32_f16_e32
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; GCN:    s_endpgm
@@ -207,9 +214,11 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math:
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN:    s_endpgm
@@ -231,9 +240,11 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math:
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN:    s_endpgm
@@ -255,9 +266,12 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math:
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
 ; GCN:    s_endpgm
@@ -279,7 +293,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16
+; GCN-LABEL: {{^}}mac_v2f16:
 ; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
@@ -322,7 +336,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_same_add
+; GCN-LABEL: {{^}}mac_v2f16_same_add:
 ; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
 ; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
 ; SI:  v_mac_f32_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
@@ -358,10 +372,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_a
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_a:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -385,9 +402,12 @@
 }
 
 ; GCN-LABEL: {{^}}mac_v2f16_neg_b
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
+
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -410,10 +430,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_c:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
@@ -464,7 +487,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math
+; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math:
 ; SI:  v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; SI:  v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
 ; SI:  v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
@@ -492,7 +515,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math
+; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math:
 ; SI:  v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; SI:  v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
 ; SI:  v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
@@ -520,10 +543,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
@@ -546,10 +572,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
@@ -572,10 +601,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
diff --git a/llvm/test/MC/AMDGPU/gfx8_asm_all.s b/llvm/test/MC/AMDGPU/gfx8_asm_all.s
index 30e7eee..fcf6e29 100644
--- a/llvm/test/MC/AMDGPU/gfx8_asm_all.s
+++ b/llvm/test/MC/AMDGPU/gfx8_asm_all.s
@@ -24330,14 +24330,16 @@
 v_cvt_f32_f16_e64 v0, 0
 // CHECK: [0x00,0x00,0x4b,0xd1,0x80,0x00,0x00,0x00]
 
+// FIXME: Parsing source modifiers
 v_cvt_f32_f16_e64 v0, -1
-// CHECK: [0x00,0x00,0x4b,0xd1,0xc1,0x00,0x00,0x00]
+// CHECK: [0x00,0x00,0x4b,0xd1,0x81,0x00,0x00,0x20]
 
 v_cvt_f32_f16_e64 v0, 0.5
 // CHECK: [0x00,0x00,0x4b,0xd1,0xf0,0x00,0x00,0x00]
 
+// FIXME: Parsing source modifiers
 v_cvt_f32_f16_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x4b,0xd1,0xf7,0x00,0x00,0x00]
+// CHECK: [0x00,0x00,0x4b,0xd1,0xf6,0x00,0x00,0x20]
 
 v_cvt_f32_f16_e64 v0, v0
 // CHECK: [0x00,0x00,0x4b,0xd1,0x00,0x01,0x00,0x00]