AMDGPU: Split SILowerControlFlow into two pieces
Do most of the lowering in a pre-RA pass. Keep the skip jump
insertion late, along with a few other pieces that require more
work to move out.
One concern is that there may now be COPY instructions
which lack the necessary implicit exec uses if they are
later lowered to v_mov_b32.
This has a positive effect on SGPR usage in shader-db.
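To illustrate the exec concern, the problematic pattern looks
roughly like this (a sketch with placeholder registers, not output
from an actual compile):

  %vreg1 = COPY %vreg0          ; VGPR-to-VGPR copy; no implicit exec use modeled
  ; ... after register allocation, copy lowering may select ...
  v_mov_b32_e32 v1, v0          ; implicitly reads exec

If the COPY is rewritten to v_mov_b32 without the implicit exec
operand being added, passes that reason about exec dependencies
could reorder or delete it incorrectly.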
llvm-svn: 279464
diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll
index bb885ac..ef1e647 100644
--- a/llvm/test/CodeGen/AMDGPU/else.ll
+++ b/llvm/test/CodeGen/AMDGPU/else.ll
@@ -25,11 +25,13 @@
}
; CHECK-LABEL: {{^}}else_execfix_leave_wqm:
+; CHECK: ; BB#0:
+; CHECK-NEXT: s_mov_b64 [[INIT_EXEC:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
-; CHECK-NEXT: s_and_b64 exec, exec,
-; CHECK-NEXT: s_and_b64 [[DST]], exec, [[DST]]
-; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
+; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
+; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
+; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
; CHECK-NEXT: ; mask branch
define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) {
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 35e06fa..c1f8d59 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -2,11 +2,33 @@
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-; SI-LABEL: @test_if
+; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions
+
+
+; waitcnt should be inserted after exec modification
+; SI: v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
+; SI-NEXT: s_xor_b64 [[SAVE]], exec, [[SAVE]]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
+; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
+
+; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
+; SI: s_and_saveexec_b64
+; SI-NEXT: s_xor_b64
+; SI-NEXT: ; mask branch
+
+; v_mov should be after exec modification
+; SI: [[FLOW_BB]]:
+; SI-NEXT: s_or_saveexec_b64 [[SAVE]], [[SAVE]]
+; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
+; SI-NEXT: s_xor_b64 exec, exec, [[SAVE]]
+; SI-NEXT: ; mask branch
+;
define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -17,12 +39,12 @@
case0:
%arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
- store i32 0, i32 addrspace(1)* %arrayidx1, align 4
+ store i32 13, i32 addrspace(1)* %arrayidx1, align 4
br label %end
case1:
%arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
- store i32 1, i32 addrspace(1)* %arrayidx5, align 4
+ store i32 17, i32 addrspace(1)* %arrayidx5, align 4
br label %end
default:
@@ -31,11 +53,11 @@
br i1 %cmp8, label %if, label %else
if:
- store i32 2, i32 addrspace(1)* %arrayidx10, align 4
+ store i32 19, i32 addrspace(1)* %arrayidx10, align 4
br label %end
else:
- store i32 3, i32 addrspace(1)* %arrayidx10, align 4
+ store i32 21, i32 addrspace(1)* %arrayidx10, align 4
br label %end
end:
@@ -139,10 +161,11 @@
; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
; SI: [[LABEL_FLOW]]:
-; SI: s_or_b64 exec, exec, [[ORNEG2]]
-; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
-; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
-; SI: s_cbranch_execnz [[LABEL_LOOP]]
+; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
+; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
+; SI-NEXT: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
; SI: BB#5
; SI: s_or_b64 exec, exec, [[COND_STATE]]