AMDGPU: Run SIFoldOperands after PeepholeOptimizer PeepholeOptimizer cleans up redundant copies, which makes the operand folding more effective. shader-db stats: Totals: SGPRS: 34200 -> 34336 (0.40 %) VGPRS: 22118 -> 21655 (-2.09 %) Code Size: 632144 -> 633460 (0.21 %) bytes LDS: 11 -> 11 (0.00 %) blocks Scratch: 10240 -> 11264 (10.00 %) bytes per wave Max Waves: 8822 -> 8918 (1.09 %) Wait states: 0 -> 0 (0.00 %) Totals from affected shaders: SGPRS: 7704 -> 7840 (1.77 %) VGPRS: 5169 -> 4706 (-8.96 %) Code Size: 234444 -> 235760 (0.56 %) bytes LDS: 2 -> 2 (0.00 %) blocks Scratch: 0 -> 1024 (0.00 %) bytes per wave Max Waves: 1188 -> 1284 (8.08 %) Wait states: 0 -> 0 (0.00 %) Increases: SGPRS: 35 (0.01 %) VGPRS: 1 (0.00 %) Code Size: 59 (0.02 %) LDS: 0 (0.00 %) Scratch: 1 (0.00 %) Max Waves: 48 (0.02 %) Wait states: 0 (0.00 %) Decreases: SGPRS: 26 (0.01 %) VGPRS: 54 (0.02 %) Code Size: 68 (0.03 %) LDS: 0 (0.00 %) Scratch: 0 (0.00 %) Max Waves: 4 (0.00 %) Wait states: 0 (0.00 %) llvm-svn: 266378

commit: 3d1c1deb04efe0022f11ebc18bb43d7341ba0c75 [log] [tgz]
author: Matt Arsenault <Matthew.Arsenault@amd.com> Thu Apr 14 21:58:24 2016 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> Thu Apr 14 21:58:24 2016 +0000
tree: 3a14403716facae35ba4c41889a588710093fb5c
parent: 4ac341c8b31ab34c7cb90eda91a91e78a11a8baf [diff] [blame]
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index be0670d..d76a839 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll

@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 declare float @llvm.fma.f32(float, float, float) #1
@@ -107,7 +107,7 @@
 
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
@@ -227,7 +227,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]]
 
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS1]], [[SGPR0]], [[VK0]]
 ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]]
 
@@ -254,7 +254,7 @@
 
 ; Same zero component is re-used for half of each immediate.
 ; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
-; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
+; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
 
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
commit	3d1c1deb04efe0022f11ebc18bb43d7341ba0c75	[log] [tgz]
author	Matt Arsenault <Matthew.Arsenault@amd.com>	Thu Apr 14 21:58:24 2016 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	Thu Apr 14 21:58:24 2016 +0000
tree	3a14403716facae35ba4c41889a588710093fb5c
parent	4ac341c8b31ab34c7cb90eda91a91e78a11a8baf [diff] [blame]