AMDGPU: Merge BUFFER_LOAD_DWORD_OFFEN into x2, x4

Summary:
-9.9% code size decrease in affected shaders.

Totals (changed stats only):
SGPRS: 2151462 -> 2170646 (0.89 %)
VGPRS: 1634612 -> 1640288 (0.35 %)
Spilled SGPRs: 8942 -> 8940 (-0.02 %)
Code Size: 52940672 -> 51727288 (-2.29 %) bytes
Max Waves: 373066 -> 371718 (-0.36 %)

Totals from affected shaders:
SGPRS: 283520 -> 302704 (6.77 %)
VGPRS: 227632 -> 233308 (2.49 %)
Spilled SGPRs: 3966 -> 3964 (-0.05 %)
Code Size: 12203080 -> 10989696 (-9.94 %) bytes
Max Waves: 44070 -> 42722 (-3.06 %)

Reviewers: arsenm, nhaehnle

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye

Differential Revision: https://reviews.llvm.org/D38950

llvm-svn: 317752
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
index 03caca8..14b9a2e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -126,8 +126,76 @@
   ret float %val
 }
 
+;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged:
+;CHECK-NEXT: BB#
+;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+main_body:
+  %a1 = add i32 %a, 4
+  %a2 = add i32 %a, 8
+  %a3 = add i32 %a, 12
+  %a4 = add i32 %a, 16
+  %a5 = add i32 %a, 28
+  %a6 = add i32 %a, 32
+  %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+  %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+  %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
+  %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0)
+  %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0)
+  %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
+;CHECK-NEXT: BB#
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
+main_body:
+  %a1 = add i32 %a, 4
+  %a2 = add i32 %a, 8
+  %a3 = add i32 %a, 12
+  %a4 = add i32 %a, 16
+  %a5 = add i32 %a, 28
+  %a6 = add i32 %a, 32
+  %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+  %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+  %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0)
+  %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0)
+  %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1)
+  %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged:
+;CHECK-NEXT: BB#
+;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+main_body:
+  %a1 = add i32 %a, 4
+  %a2 = add i32 %a, 12
+  %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+  %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+  %r1 = extractelement <2 x float> %vr1, i32 0
+  %r2 = extractelement <2 x float> %vr1, i32 1
+  %r3 = extractelement <2 x float> %vr2, i32 0
+  %r4 = extractelement <2 x float> %vr2, i32 1
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+  ret void
+}
+
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 
 attributes #0 = { nounwind readonly }