[AMDGPU] Extend the SI Load/Store optimizer to combine more things. I've extended the load/store optimizer to be able to produce dwordx3 loads and stores. This change allows many more loads/stores to be combined, and results in much more optimal code for our hardware. Differential Revision: https://reviews.llvm.org/D54042 llvm-svn: 348937

commit: 76504a4c5e196aac50afe65f1db55345b9a01b7e [log] [tgz]
author: Neil Henning <neil.henning@amd.com> Wed Dec 12 16:15:21 2018 +0000
committer: Neil Henning <neil.henning@amd.com> Wed Dec 12 16:15:21 2018 +0000
tree: bacefc350bd264ec596c3b27c90fe518e97d8d38
parent: ef8683abec6b4f36cef4bba2fd6a4b69f0e59f22 [diff] [blame]
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 0b28d89..d8126fa 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

@@ -36,10 +36,10 @@
 ; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
 ; GCN-NOT: v_cvt_f32_ubyte3_e32
-; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[VAL]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; GCN: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
commit	76504a4c5e196aac50afe65f1db55345b9a01b7e	[log] [tgz]
author	Neil Henning <neil.henning@amd.com>	Wed Dec 12 16:15:21 2018 +0000
committer	Neil Henning <neil.henning@amd.com>	Wed Dec 12 16:15:21 2018 +0000
tree	bacefc350bd264ec596c3b27c90fe518e97d8d38
parent	ef8683abec6b4f36cef4bba2fd6a4b69f0e59f22 [diff] [blame]