[AMDGPU] Add intrinsics for 16 bit interpolation

Summary:
Added the intrinsics llvm.amdgcn.interp.p1.f16() and
llvm.amdgcn.interp.p2.f16() and the related LIT tests.

The p1 intrinsic generates code appropriate for both 16 and 32
bank LDS.

Reviewers: #amdgpu, dstuttard, arsenm, tpr

Reviewed By: #amdgpu, arsenm

Subscribers: jvesely, mgorny, arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D46754

llvm-svn: 352357
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ee2ab60..88f475a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1165,6 +1165,20 @@
             [IntrNoMem, IntrSpeculatable]>;
           // See int_amdgcn_v_interp_p1 for why this is IntrNoMem.
 
+// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
+def int_amdgcn_interp_p1_f16 :
+  GCCBuiltin<"__builtin_amdgcn_interp_p1_f16">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
+
+// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
+def int_amdgcn_interp_p2_f16 :
+  GCCBuiltin<"__builtin_amdgcn_interp_p2_f16">,
+  Intrinsic<[llvm_half_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
+
 // Pixel shaders only: whether the current pixel is live (i.e. not a helper
 // invocation for derivative computation).
 def int_amdgcn_ps_live : Intrinsic <
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f170bc5..a15b2b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4183,6 +4183,9 @@
   NODE_NAME_CASE(INTERP_MOV)
   NODE_NAME_CASE(INTERP_P1)
   NODE_NAME_CASE(INTERP_P2)
+  NODE_NAME_CASE(INTERP_P1LL_F16)
+  NODE_NAME_CASE(INTERP_P1LV_F16)
+  NODE_NAME_CASE(INTERP_P2_F16)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 557479a..8b8e16e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -462,6 +462,9 @@
   INTERP_MOV,
   INTERP_P1,
   INTERP_P2,
+  INTERP_P1LL_F16,
+  INTERP_P1LV_F16,
+  INTERP_P2_F16,
   PC_ADD_REL_OFFSET,
   KILL,
   DUMMY_CHAIN,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 3702c3f..73ae5de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -369,6 +369,17 @@
                       SDTypeProfile<1, 4, [SDTCisFP<0>]>,
                       [SDNPInGlue]>;
 
+def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
+                            SDTypeProfile<1, 7, [SDTCisFP<0>]>,
+                            [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16",
+                            SDTypeProfile<1, 9, [SDTCisFP<0>]>,
+                            [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16",
+                          SDTypeProfile<1, 8, [SDTCisFP<0>]>,
+                          [SDNPInGlue]>;
 
 def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
   [SDNPHasChain, SDNPSideEffect]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index cca656b..916b1ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -48,6 +48,8 @@
 def : SourceOfDivergence<int_amdgcn_interp_mov>;
 def : SourceOfDivergence<int_amdgcn_interp_p1>;
 def : SourceOfDivergence<int_amdgcn_interp_p2>;
+def : SourceOfDivergence<int_amdgcn_interp_p1_f16>;
+def : SourceOfDivergence<int_amdgcn_interp_p2_f16>;
 def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
 def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
 def : SourceOfDivergence<int_r600_read_tidig_x>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 117014d..0c1fca8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5292,6 +5292,59 @@
                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
                        Glue);
   }
+  case Intrinsic::amdgcn_interp_p1_f16: { // operands: i, attrchan, attr, high, m0
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); // m0
+    SDValue Glue = M0.getValue(1); // result #1 of the M0 copy
+    if (getSubtarget()->getLDSBankCount() == 16) {
+      // 16 bank LDS: read P0 with INTERP_MOV and pass it to P1LV as Src2.
+      SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
+                              DAG.getConstant(2, DL, MVT::i32), // P0 selector
+                              Op.getOperand(2), // Attrchan
+                              Op.getOperand(3), // Attr
+                              Glue); // result #1 of the M0 copy
+      SDValue Ops[] = {
+        Op.getOperand(1), // Src0 (i)
+        Op.getOperand(2), // Attrchan
+        Op.getOperand(3), // Attr
+        DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers (none)
+        S, // Src2 - holds two f16 values selected by high
+        DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers (none)
+        Op.getOperand(4), // high
+        DAG.getConstant(0, DL, MVT::i1), // $clamp (off)
+        DAG.getConstant(0, DL, MVT::i32) // $omod (none)
+      }; // NOTE(review): unlike the 32 bank Ops, no Glue is appended - confirm.
+      return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
+    } else {
+      // 32 bank LDS: P1LL takes no Src2; Glue is passed as the last operand.
+      SDValue Ops[] = {
+        Op.getOperand(1), // Src0 (i)
+        Op.getOperand(2), // Attrchan
+        Op.getOperand(3), // Attr
+        DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers (none)
+        Op.getOperand(4), // high
+        DAG.getConstant(0, DL, MVT::i1), // $clamp (off)
+        DAG.getConstant(0, DL, MVT::i32), // $omod (none)
+        Glue // result #1 of the M0 copy
+      };
+      return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
+    }
+  }
+  case Intrinsic::amdgcn_interp_p2_f16: { // operands: p1, j, attrchan, attr, high, m0
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6)); // m0
+    SDValue Glue = M0.getValue(1); // result #1 of the M0 copy, as in the p1 case
+    SDValue Ops[] = {
+      Op.getOperand(2), // Src0 (j)
+      Op.getOperand(3), // Attrchan
+      Op.getOperand(4), // Attr
+      DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers (none)
+      Op.getOperand(1), // Src2 (p1, the result of interp.p1.f16)
+      DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers (none)
+      Op.getOperand(5), // high
+      DAG.getConstant(0, DL, MVT::i1), // $clamp (off); no $omod operand here
+      Glue // result #1 of the M0 copy
+    };
+    return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
+  }
   case Intrinsic::amdgcn_sin:
     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index da31f78..babd041 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -462,7 +462,14 @@
 let FPDPRounding = 1 in {
 def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
 let Uses = [M0, EXEC] in {
-def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
+       [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
+                                                        (i32 imm:$attr),
+                                                        (i32 imm:$src0_modifiers),
+                                                        (f32 VRegSrc_32:$src2),
+                                                        (i32 imm:$src2_modifiers),
+                                                        (i1 imm:$high),
+                                                        (i1 imm:$clamp)))]>;
 } // End Uses = [M0, EXEC]
 } // End FPDPRounding = 1
 } // End renamedInGFX9 = 1
@@ -477,8 +484,22 @@
 } // End SubtargetPredicate = isGFX9
 
 let Uses = [M0, EXEC], FPDPRounding = 1 in {
-def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
-def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
+       [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
+                                                          (i32 imm:$attr),
+                                                          (i32 imm:$src0_modifiers),
+                                                          (i1 imm:$high),
+                                                          (i1 imm:$clamp),
+                                                          (i32 imm:$omod)))]>;
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
+       [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
+                                                          (i32 imm:$attr),
+                                                          (i32 imm:$src0_modifiers),
+                                                          (f32 VRegSrc_32:$src2),
+                                                          (i32 imm:$src2_modifiers),
+                                                          (i1 imm:$high),
+                                                          (i1 imm:$clamp),
+                                                          (i32 imm:$omod)))]>;
 } // End Uses = [M0, EXEC], FPDPRounding = 1
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/interp_f16.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/interp_f16.ll
new file mode 100644
index 0000000..174dd56
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/interp_f16.ll
@@ -0,0 +1,25 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; CHECK: for function 'interp_p1_f16'
+; CHECK: DIVERGENT:       %p1 = call float @llvm.amdgcn.interp.p1.f16
+define amdgpu_ps float @interp_p1_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+main_body:
+  %p1 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
+  ret float %p1
+}
+
+; CHECK: for function 'interp_p2_f16'
+; CHECK: DIVERGENT:       %p2 = call half @llvm.amdgcn.interp.p2.f16
+define amdgpu_ps half @interp_p2_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+main_body:
+  %p2 = call half @llvm.amdgcn.interp.p2.f16(float %i, float %j, i32 1, i32 2, i1 0, i32 %m0)
+  ret half %p2
+}
+
+; float @llvm.amdgcn.interp.p1.f16(i, attrchan, attr, high, m0)
+declare float @llvm.amdgcn.interp.p1.f16(float, i32, i32, i1, i32) #0
+; half @llvm.amdgcn.interp.p2.f16(p1, j, attrchan, attr, high, m0)
+declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll
new file mode 100644
index 0000000..fb08e6d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll
@@ -0,0 +1,187 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-32BANK %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-32BANK %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-16BANK %s
+
+define amdgpu_ps half @interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GFX9-32BANK-LABEL: interp_f16:
+; GFX9-32BANK:       ; %bb.0: ; %main_body
+; GFX9-32BANK-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-32BANK-NEXT:    s_mov_b32 m0, s2
+; GFX9-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX9-32BANK-NEXT:    v_interp_p1ll_f16 v1, v0, attr2.y
+; GFX9-32BANK-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-32BANK-NEXT:    v_interp_p1ll_f16 v0, v0, attr2.y high
+; GFX9-32BANK-NEXT:    v_interp_p2_legacy_f16 v1, v2, attr2.y, v1
+; GFX9-32BANK-NEXT:    v_interp_p2_legacy_f16 v0, v2, attr2.y, v0 high
+; GFX9-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX9-32BANK-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-32BANK-NEXT:    ; return to shader part epilog
+;
+; GFX8-32BANK-LABEL: interp_f16:
+; GFX8-32BANK:       ; %bb.0: ; %main_body
+; GFX8-32BANK-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-32BANK-NEXT:    s_mov_b32 m0, s2
+; GFX8-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-32BANK-NEXT:    v_interp_p1ll_f16 v1, v0, attr2.y
+; GFX8-32BANK-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-32BANK-NEXT:    v_interp_p1ll_f16 v0, v0, attr2.y high
+; GFX8-32BANK-NEXT:    v_interp_p2_f16 v1, v2, attr2.y, v1
+; GFX8-32BANK-NEXT:    v_interp_p2_f16 v0, v2, attr2.y, v0 high
+; GFX8-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX8-32BANK-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX8-32BANK-NEXT:    ; return to shader part epilog
+;
+; GFX8-16BANK-LABEL: interp_f16:
+; GFX8-16BANK:       ; %bb.0: ; %main_body
+; GFX8-16BANK-NEXT:    s_mov_b32 m0, s2
+; GFX8-16BANK-NEXT:    v_interp_mov_f32_e32 v0, p0, attr2.y
+; GFX8-16BANK-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-16BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-16BANK-NEXT:    v_interp_p1lv_f16 v2, v1, attr2.y, v0
+; GFX8-16BANK-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-16BANK-NEXT:    v_interp_p1lv_f16 v0, v1, attr2.y, v0 high
+; GFX8-16BANK-NEXT:    v_interp_p2_f16 v2, v3, attr2.y, v2
+; GFX8-16BANK-NEXT:    v_interp_p2_f16 v0, v3, attr2.y, v0 high
+; GFX8-16BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX8-16BANK-NEXT:    v_add_f16_e32 v0, v2, v0
+; GFX8-16BANK-NEXT:    ; return to shader part epilog
+main_body:
+  %p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
+  %p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0)
+  %p1_1 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 1, i32 %m0)
+  %p2_1 = call half @llvm.amdgcn.interp.p2.f16(float %p1_1, float %j, i32 1, i32 2, i1 1, i32 %m0)
+  %res = fadd half %p2_0, %p2_1
+  ret half %res
+}
+
+; check that m0 is setup correctly before the interp p1 instruction
+define amdgpu_ps half @interp_p1_m0_setup(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GFX9-32BANK-LABEL: interp_p1_m0_setup:
+; GFX9-32BANK:       ; %bb.0: ; %main_body
+; GFX9-32BANK-NEXT:    ;;#ASMSTART
+; GFX9-32BANK-NEXT:    s_mov_b32 m0, 0
+; GFX9-32BANK-NEXT:    ;;#ASMEND
+; GFX9-32BANK-NEXT:    s_mov_b32 s3, m0
+; GFX9-32BANK-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-32BANK-NEXT:    s_mov_b32 m0, s2
+; GFX9-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX9-32BANK-NEXT:    v_interp_p1ll_f16 v0, v0, attr2.y
+; GFX9-32BANK-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-32BANK-NEXT:    v_interp_p2_legacy_f16 v0, v1, attr2.y, v0
+; GFX9-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX9-32BANK-NEXT:    v_add_f16_e32 v0, s3, v0
+; GFX9-32BANK-NEXT:    ; return to shader part epilog
+;
+; GFX8-32BANK-LABEL: interp_p1_m0_setup:
+; GFX8-32BANK:       ; %bb.0: ; %main_body
+; GFX8-32BANK-NEXT:    ;;#ASMSTART
+; GFX8-32BANK-NEXT:    s_mov_b32 m0, 0
+; GFX8-32BANK-NEXT:    ;;#ASMEND
+; GFX8-32BANK-NEXT:    s_mov_b32 s3, m0
+; GFX8-32BANK-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-32BANK-NEXT:    s_mov_b32 m0, s2
+; GFX8-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-32BANK-NEXT:    v_interp_p1ll_f16 v0, v0, attr2.y
+; GFX8-32BANK-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-32BANK-NEXT:    v_interp_p2_f16 v0, v1, attr2.y, v0
+; GFX8-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX8-32BANK-NEXT:    v_add_f16_e32 v0, s3, v0
+; GFX8-32BANK-NEXT:    ; return to shader part epilog
+;
+; GFX8-16BANK-LABEL: interp_p1_m0_setup:
+; GFX8-16BANK:       ; %bb.0: ; %main_body
+; GFX8-16BANK-NEXT:    ;;#ASMSTART
+; GFX8-16BANK-NEXT:    s_mov_b32 m0, 0
+; GFX8-16BANK-NEXT:    ;;#ASMEND
+; GFX8-16BANK-NEXT:    s_mov_b32 s3, m0
+; GFX8-16BANK-NEXT:    s_mov_b32 m0, s2
+; GFX8-16BANK-NEXT:    v_interp_mov_f32_e32 v0, p0, attr2.y
+; GFX8-16BANK-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-16BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-16BANK-NEXT:    v_interp_p1lv_f16 v0, v1, attr2.y, v0
+; GFX8-16BANK-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-16BANK-NEXT:    v_interp_p2_f16 v0, v1, attr2.y, v0
+; GFX8-16BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX8-16BANK-NEXT:    v_add_f16_e32 v0, s3, v0
+; GFX8-16BANK-NEXT:    ; return to shader part epilog
+main_body:
+  %mx = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
+  %p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
+  %p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0)
+  %my = trunc i32 %mx to i16
+  %mh = bitcast i16 %my to half
+  %res = fadd half %p2_0, %mh
+  ret half %res
+}
+
+; check that m0 is setup correctly before the interp p2 instruction
+define amdgpu_ps half @interp_p2_m0_setup(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GFX9-32BANK-LABEL: interp_p2_m0_setup:
+; GFX9-32BANK:       ; %bb.0: ; %main_body
+; GFX9-32BANK-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-32BANK-NEXT:    s_mov_b32 m0, s2
+; GFX9-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX9-32BANK-NEXT:    v_interp_p1ll_f16 v0, v0, attr2.y
+; GFX9-32BANK-NEXT:    ;;#ASMSTART
+; GFX9-32BANK-NEXT:    s_mov_b32 m0, 0
+; GFX9-32BANK-NEXT:    ;;#ASMEND
+; GFX9-32BANK-NEXT:    s_mov_b32 s0, m0
+; GFX9-32BANK-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-32BANK-NEXT:    s_mov_b32 m0, s2
+; GFX9-32BANK-NEXT:    v_interp_p2_legacy_f16 v0, v1, attr2.y, v0
+; GFX9-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX9-32BANK-NEXT:    v_add_f16_e32 v0, s0, v0
+; GFX9-32BANK-NEXT:    ; return to shader part epilog
+;
+; GFX8-32BANK-LABEL: interp_p2_m0_setup:
+; GFX8-32BANK:       ; %bb.0: ; %main_body
+; GFX8-32BANK-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-32BANK-NEXT:    s_mov_b32 m0, s2
+; GFX8-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-32BANK-NEXT:    v_interp_p1ll_f16 v0, v0, attr2.y
+; GFX8-32BANK-NEXT:    ;;#ASMSTART
+; GFX8-32BANK-NEXT:    s_mov_b32 m0, 0
+; GFX8-32BANK-NEXT:    ;;#ASMEND
+; GFX8-32BANK-NEXT:    s_mov_b32 s0, m0
+; GFX8-32BANK-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-32BANK-NEXT:    s_mov_b32 m0, s2
+; GFX8-32BANK-NEXT:    v_interp_p2_f16 v0, v1, attr2.y, v0
+; GFX8-32BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX8-32BANK-NEXT:    v_add_f16_e32 v0, s0, v0
+; GFX8-32BANK-NEXT:    ; return to shader part epilog
+;
+; GFX8-16BANK-LABEL: interp_p2_m0_setup:
+; GFX8-16BANK:       ; %bb.0: ; %main_body
+; GFX8-16BANK-NEXT:    s_mov_b32 m0, s2
+; GFX8-16BANK-NEXT:    v_interp_mov_f32_e32 v0, p0, attr2.y
+; GFX8-16BANK-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-16BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-16BANK-NEXT:    v_interp_p1lv_f16 v0, v1, attr2.y, v0
+; GFX8-16BANK-NEXT:    ;;#ASMSTART
+; GFX8-16BANK-NEXT:    s_mov_b32 m0, 0
+; GFX8-16BANK-NEXT:    ;;#ASMEND
+; GFX8-16BANK-NEXT:    s_mov_b32 s0, m0
+; GFX8-16BANK-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-16BANK-NEXT:    s_mov_b32 m0, s2
+; GFX8-16BANK-NEXT:    v_interp_p2_f16 v0, v1, attr2.y, v0
+; GFX8-16BANK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX8-16BANK-NEXT:    v_add_f16_e32 v0, s0, v0
+; GFX8-16BANK-NEXT:    ; return to shader part epilog
+main_body:
+  %p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
+  %mx = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
+  %p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0)
+  %my = trunc i32 %mx to i16
+  %mh = bitcast i16 %my to half
+  %res = fadd half %p2_0, %mh
+  ret half %res
+}
+
+; float @llvm.amdgcn.interp.p1.f16(i, attrchan, attr, high, m0)
+declare float @llvm.amdgcn.interp.p1.f16(float, i32, i32, i1, i32) #0
+; half @llvm.amdgcn.interp.p2.f16(p1, j, attrchan, attr, high, m0)
+declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }