AMDGPU: Split SILowerControlFlow into two pieces
Do most of the lowering in a pre-RA pass. Keep the skip jump
insertion late, along with a few other pieces that require more
work to move out.
One concern is that there may now be COPY instructions
which lack the necessary implicit exec uses if they are
later lowered to v_mov_b32.
This has a positive effect on SGPR usage in shader-db.
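To illustrate the exec concern, the problematic pattern looks
roughly like this (a sketch with placeholder registers, not output
from an actual compile):

  %vreg1 = COPY %vreg0          ; VGPR-to-VGPR copy; no implicit exec use modeled
  ; ... after register allocation, copy lowering may select ...
  v_mov_b32_e32 v1, v0          ; implicitly reads exec

If the COPY is rewritten to v_mov_b32 without the implicit exec
operand being added, passes that reason about exec dependencies
could reorder or delete it incorrectly.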
llvm-svn: 279464
diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll
index bb885ac..ef1e647 100644
--- a/llvm/test/CodeGen/AMDGPU/else.ll
+++ b/llvm/test/CodeGen/AMDGPU/else.ll
@@ -25,11 +25,13 @@
}
; CHECK-LABEL: {{^}}else_execfix_leave_wqm:
+; CHECK: ; BB#0:
+; CHECK-NEXT: s_mov_b64 [[INIT_EXEC:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
-; CHECK-NEXT: s_and_b64 exec, exec,
-; CHECK-NEXT: s_and_b64 [[DST]], exec, [[DST]]
-; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
+; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
+; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
+; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
; CHECK-NEXT: ; mask branch
define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) {
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 35e06fa..c1f8d59 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -2,11 +2,33 @@
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-; SI-LABEL: @test_if
+; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions
+
+
+; waitcnt should be inserted after exec modification
+; SI: v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
+; SI-NEXT: s_xor_b64 [[SAVE]], exec, [[SAVE]]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
+; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
+
+; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
+; SI: s_and_saveexec_b64
+; SI-NEXT: s_xor_b64
+; SI-NEXT: ; mask branch
+
+; v_mov should be after exec modification
+; SI: [[FLOW_BB]]:
+; SI-NEXT: s_or_saveexec_b64 [[SAVE]], [[SAVE]]
+; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
+; SI-NEXT: s_xor_b64 exec, exec, [[SAVE]]
+; SI-NEXT: ; mask branch
+;
define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -17,12 +39,12 @@
case0:
%arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
- store i32 0, i32 addrspace(1)* %arrayidx1, align 4
+ store i32 13, i32 addrspace(1)* %arrayidx1, align 4
br label %end
case1:
%arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
- store i32 1, i32 addrspace(1)* %arrayidx5, align 4
+ store i32 17, i32 addrspace(1)* %arrayidx5, align 4
br label %end
default:
@@ -31,11 +53,11 @@
br i1 %cmp8, label %if, label %else
if:
- store i32 2, i32 addrspace(1)* %arrayidx10, align 4
+ store i32 19, i32 addrspace(1)* %arrayidx10, align 4
br label %end
else:
- store i32 3, i32 addrspace(1)* %arrayidx10, align 4
+ store i32 21, i32 addrspace(1)* %arrayidx10, align 4
br label %end
end:
@@ -139,10 +161,11 @@
; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
; SI: [[LABEL_FLOW]]:
-; SI: s_or_b64 exec, exec, [[ORNEG2]]
-; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
-; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
-; SI: s_cbranch_execnz [[LABEL_LOOP]]
+; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
+; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
+; SI-NEXT: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
; SI: BB#5
; SI: s_or_b64 exec, exec, [[COND_STATE]]