DAGCombiner: Combine extract_vector_elt from build_vector

This basic combine was surprisingly missing.
AMDGPU legalizes many operations in terms of 32-bit vector components,
so not doing this results in many extra copies and subregister extracts
that need to be cleaned up later.

InstCombine already performs this fold for the hasOneUse case. The target
hook exists to avoid regressing a handful of tests (e.g. ARM/vmov.ll)
that would otherwise change from materializing a repeated-immediate
vector with a single instruction to loading a constant vector and making
more scalar copies from it.
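
For reference, a minimal sketch of the fold in SelectionDAG terms
(illustrative only, not the actual patch; the helper name is hypothetical
and the guarding target-hook check is omitted):

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Sketch: fold (extract_vector_elt (build_vector x0, ..., xn), c) -> xc
  // when the extract index is a constant.
  static SDValue combineExtractFromBuildVector(SDNode *N, SelectionDAG &DAG) {
    SDValue Vec = N->getOperand(0);
    SDValue Idx = N->getOperand(1);
    if (Vec.getOpcode() != ISD::BUILD_VECTOR)
      return SDValue();

    auto *CIdx = dyn_cast<ConstantSDNode>(Idx);
    if (!CIdx || CIdx->getAPIntValue().uge(Vec.getNumOperands()))
      return SDValue();

    SDValue Elt = Vec.getOperand(CIdx->getZExtValue());
    EVT VT = N->getValueType(0);

    // BUILD_VECTOR operands may be wider than the vector element type
    // (e.g. i32 operands for a v16i8 node), so truncate when necessary.
    if (Elt.getValueType() != VT) {
      if (!Elt.getValueType().isInteger() || !VT.isInteger())
        return SDValue();
      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Elt);
    }
    return Elt;
  }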

llvm-svn: 250129
diff --git a/llvm/test/CodeGen/AArch64/fold-constants.ll b/llvm/test/CodeGen/AArch64/fold-constants.ll
index 2dd0d12..3f70f0a 100644
--- a/llvm/test/CodeGen/AArch64/fold-constants.ll
+++ b/llvm/test/CodeGen/AArch64/fold-constants.ll
@@ -3,9 +3,6 @@
 define i64 @dotests_616() {
 ; CHECK-LABEL: dotests_616
 ; CHECK:       movi d0, #0000000000000000
-; CHECK-NEXT:  umov w8, v0.b[2]
-; CHECK-NEXT:  sbfx w8, w8, #0, #1
-; CHECK-NEXT:  fmov s0, w8
 ; CHECK-NEXT:  fmov x0, d0
 ; CHECK-NEXT:  ret
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index ec04f8b..579f989 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -216,10 +216,8 @@
   ret void
 }
 
-; We should be able to merge in this case, but probably not worth the effort.
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32
-; SI: ds_read_b32
+; SI-LABEL: {{^}}read2_ptr_is_subreg_f32:
+; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
 ; SI: s_endpgm
 define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index e8c34f0..c8ef5b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -17,12 +17,12 @@
 ; SI: s_lshr_b64
 ; SI: s_not_b64
 ; SI: s_and_b64
-; SI: cmp_gt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
-; SI: cmp_lt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
+; SI-DAG: cmp_gt_i32
+; SI-DAG: cndmask_b32
+; SI-DAG: cndmask_b32
+; SI-DAG: cmp_lt_i32
+; SI-DAG: cndmask_b32
+; SI-DAG: cndmask_b32
 ; SI-DAG: v_cmp_lt_f64
 ; SI-DAG: v_cmp_lg_f64
 ; SI: s_and_b64
diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
index 6618d8b..83a8ad8 100644
--- a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -29,12 +29,12 @@
 ; SI: s_lshr_b64
 ; SI: s_not_b64
 ; SI: s_and_b64
-; SI: cmp_gt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
-; SI: cmp_lt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
+; SI-DAG: cmp_gt_i32
+; SI-DAG: cndmask_b32
+; SI-DAG: cndmask_b32
+; SI-DAG: cmp_lt_i32
+; SI-DAG: cndmask_b32
+; SI-DAG: cndmask_b32
 ; SI: s_endpgm
 define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.trunc.f64(double %x) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
index 471b0f6..f5ab390 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -11,24 +11,35 @@
   ret void
 }
 
-define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
 ; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
 ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
 ; SI, which is why it is being OR'd with the base pointer.
 ; SI: s_or_b32
 ; CI: s_add_i32
 ; CHECK: ds_write_b32
+define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
   %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
   store i32 99, i32 addrspace(3)* %p
   ret void
 }
 
-define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
 ; CHECK-LABEL: {{^}}gep_as_vector_v4:
-; CHECK: s_add_i32
-; CHECK: s_add_i32
-; CHECK: s_add_i32
-; CHECK: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CHECK: s_endpgm
+define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
   %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
@@ -41,10 +52,15 @@
   ret void
 }
 
-define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
 ; CHECK-LABEL: {{^}}gep_as_vector_v2:
-; CHECK: s_add_i32
-; CHECK: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CHECK: s_endpgm
+define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
   %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
   %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1