[AMDGPU] use v32f32 for 3 mfma intrinsics These should really use v32f32, but were defined as v32i32 due to the lack of the v32f32 type. Differential Revision: https://reviews.llvm.org/D64667 llvm-svn: 365972

commit: 1dfae6fe505ffedf97e9f36d207cb8bbdc9255d8 [log] [tgz]
author: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> Fri Jul 12 22:42:01 2019 +0000
committer: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> Fri Jul 12 22:42:01 2019 +0000
tree: 2d950113da5ca558c53c071b4c41b2e46f2e09f4
parent: b1bff76e22bd39eb46dcae49891fda1cf1cc0bd5 [diff]
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 62e94108..8276d75 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td

@@ -261,6 +261,7 @@
 def llvm_v4f32_ty      : LLVMType<v4f32>;    //  4 x float
 def llvm_v8f32_ty      : LLVMType<v8f32>;    //  8 x float
 def llvm_v16f32_ty     : LLVMType<v16f32>;   // 16 x float
+def llvm_v32f32_ty     : LLVMType<v32f32>;   // 32 x float
 def llvm_v1f64_ty      : LLVMType<v1f64>;    //  1 x double
 def llvm_v2f64_ty      : LLVMType<v2f64>;    //  2 x double
 def llvm_v4f64_ty      : LLVMType<v4f64>;    //  4 x double

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 56878e1..43e827e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td

@@ -1663,8 +1663,8 @@
 def int_amdgcn_global_atomic_fadd    : AMDGPUGlobalAtomicNoRtn;
 
 // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
-def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_float_ty, llvm_float_ty, llvm_v32i32_ty,
+def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32f32_ty],
+  [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
 def int_amdgcn_mfma_f32_16x16x1f32 : Intrinsic<[llvm_v16f32_ty],
@@ -1683,8 +1683,8 @@
   [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
-def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32i32_ty,
+def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32f32_ty],
+  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
 def int_amdgcn_mfma_f32_16x16x4f16 : Intrinsic<[llvm_v16f32_ty],
@@ -1723,8 +1723,8 @@
   [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
-def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32i32_ty,
+def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32f32_ty],
+  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
 def int_amdgcn_mfma_f32_16x16x2bf16 : Intrinsic<[llvm_v16f32_ty],

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 56922b0..14ae629 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

@@ -165,6 +165,9 @@
   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
 
+  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
+
   setOperationAction(ISD::LOAD, MVT::i64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
 
@@ -256,6 +259,9 @@
   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
 
+  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
+
   setOperationAction(ISD::STORE, MVT::i64, Promote);
   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
 
@@ -355,7 +361,10 @@
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3eb1b1c..b90a0d2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

@@ -153,6 +153,7 @@
 
   if (Subtarget->hasMAIInsts()) {
     addRegisterClass(MVT::v32i32, &AMDGPU::AReg_1024RegClass);
+    addRegisterClass(MVT::v32f32, &AMDGPU::AReg_1024RegClass);
   }
 
   computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -263,8 +264,9 @@
 
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
-        MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
+  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+                  MVT::v32i32, MVT::v32f32 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 98928f0..c382c81 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td

@@ -2178,14 +2178,13 @@
 
 def VOP_V4F32_F32_F32_V4F32       : VOPProfile <[v4f32,  f32,   f32,   v4f32]>;
 def VOP_V16F32_F32_F32_V16F32     : VOPProfile <[v16f32, f32,   f32,   v16f32]>;
-// TODO: define v32f32
-def VOP_V32F32_F32_F32_V32F32     : VOPProfile <[v32i32, f32,   f32,   v32i32]>;
+def VOP_V32F32_F32_F32_V32F32     : VOPProfile <[v32f32, f32,   f32,   v32f32]>;
 def VOP_V4F32_V4F16_V4F16_V4F32   : VOPProfile <[v4f32,  v4f16, v4f16, v4f32]>;
 def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>;
-def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32i32, v4f16, v4f16, v32i32]>;
+def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>;
 def VOP_V4F32_V2I16_V2I16_V4F32   : VOPProfile <[v4f32,  v2i16, v2i16, v4f32]>;
 def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>;
-def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32i32, v2i16, v2i16, v32i32]>;
+def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>;
 def VOP_V4I32_I32_I32_V4I32       : VOPProfile <[v4i32,  i32,   i32,   v4i32]>;
 def VOP_V16I32_I32_I32_V16I32     : VOPProfile <[v16i32, i32,   i32,   v16i32]>;
 def VOP_V32I32_I32_I32_V32I32     : VOPProfile <[v32i32, i32,   i32,   v32i32]>;

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index fd4b6f5..70f20bb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td

@@ -942,6 +942,14 @@
   def Insert_Element_v32i32_#Index : Insert_Element <
     i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
   >;
+
+  def Extract_Element_v32f32_#Index : Extract_Element <
+    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Insert_Element_v32f32_#Index : Insert_Element <
+    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
 }
 
 // FIXME: Why do only some of these type combinations for SReg and
@@ -1034,6 +1042,10 @@
 def : BitConvert <v16i32, v16f32, VReg_512>;
 def : BitConvert <v16f32, v16i32, VReg_512>;
 
+// 1024-bit bitcast
+def : BitConvert <v32i32, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v32i32, VReg_1024>;
+
 /********** =================== **********/
 /********** Src & Dst modifiers **********/
 /********** =================== **********/

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 14d41d8..4767f3c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td

@@ -757,11 +757,11 @@
   let isAllocatable = 0;
 }
 
-def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add SGPR_1024Regs)> {
+def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add SGPR_1024Regs)> {
   let AllocationPriority = 19;
 }
 
-def SReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32,
+def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
   (add SGPR_1024)> {
   let CopyCost = 16;
   let AllocationPriority = 19;
@@ -812,7 +812,7 @@
   let AllocationPriority = 7;
 }
 
-def VReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add VGPR_1024)> {
+def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add VGPR_1024)> {
   let Size = 1024;
   let CopyCost = 32;
   let AllocationPriority = 8;
@@ -840,7 +840,7 @@
 }
 
 // TODO: add v32f32 value type
-def AReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add AGPR_1024)> {
+def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add AGPR_1024)> {
   let Size = 1024;
   let CopyCost = 65;
   let AllocationPriority = 8;

diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index ab4fcc5..dfedd24 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll

@@ -1,15 +1,15 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 
 ; GCN-LABEL: {{^}}test_32_agprs:
 ; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, 0
 ; GCN-NOT: v28
 ; GCN: NumVgprs: 32
 ; GCN: VGPRBlocks: 7
-define amdgpu_kernel void @test_32_agprs(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_32_agprs(<32 x float> addrspace(1)* %arg) {
 bb:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 0ce0877..5ac0363 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll

@@ -1,11 +1,11 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32)
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
@@ -15,7 +15,7 @@
 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
 declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
 declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
@@ -100,11 +100,11 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -326,14 +326,14 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x i32> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
+define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
   %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
   %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
   %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -794,13 +794,13 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
   %a = bitcast i32 1 to <2 x i16>
   %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -957,12 +957,12 @@
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc:
 ; GCN:      v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
 ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 0, i32 0, i32 0)
-  %mai.2 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %mai.1, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.2, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.2, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -1112,10 +1112,10 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) {
 bb:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -1184,7 +1184,7 @@
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm:
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1
+; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
@@ -1256,10 +1256,10 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) {
 bb:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -1350,12 +1350,12 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %gep
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index b12a7bc7..9c7279a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll

@@ -84,23 +84,23 @@
 ; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
 ; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
 ; A2V:        ScratchSize: 0
-define amdgpu_kernel void @max_32regs_mfma32(i32 addrspace(1)* %arg) #3 {
+define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
 bb:
   %v = call i32 asm sideeffect "", "=a"()
   br label %use
 
 use:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 2>, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 2.0>, i32 0, i32 0, i32 0)
   call void asm sideeffect "", "a"(i32 %v)
-  %elt1 = extractelement <32 x i32> %mai.1, i32 0
-  store i32 %elt1, i32 addrspace(1)* %arg
+  %elt1 = extractelement <32 x float> %mai.1, i32 0
+  store float %elt1, float addrspace(1)* %arg
   ret void
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 
 attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
 attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }

diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
index b101e41..6eef782 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll

@@ -233,19 +233,23 @@
   ret void
 }
 
+; FIXME: adding an AReg_1024 register class for v32f32 and v32i32
+;        produces unnecessary copies and we still have some amount
+;        of conventional spilling.
+
 ; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GFX908-NOT: SCRATCH_RSRC
+; GFX908-FIXME-NOT: SCRATCH_RSRC
 ; GFX908-DAG: v_accvgpr_write_b32 a0, v
 ; GFX900:     buffer_store_dword v
 ; GFX900:     buffer_load_dword v
-; GFX908-NOT: buffer_
+; GFX908-FIXME-NOT: buffer_
 ; GFX908-DAG  v_accvgpr_read_b32
 
 ; GCN:    NumVgprs: 256
 ; GFX900: ScratchSize: 580
-; GFX908: ScratchSize: 0
+; GFX908-FIXME: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
 ; GCN:    NumVGPRsForWavesPerEU: 256
 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {
commit	1dfae6fe505ffedf97e9f36d207cb8bbdc9255d8	[log] [tgz]
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	Fri Jul 12 22:42:01 2019 +0000
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	Fri Jul 12 22:42:01 2019 +0000
tree	2d950113da5ca558c53c071b4c41b2e46f2e09f4
parent	b1bff76e22bd39eb46dcae49891fda1cf1cc0bd5 [diff]