Revert "[MBP] Disable aggressive loop rotate in plain mode"
This reverts r369664 (git commit 51f48295cbe8fa3a44db263b528dd9f7bae7bf9a)
It causes many benchmark regressions, both internally and in LLVM's benchmark suite.
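
For context, the behavior being restored makes MachineBlockPlacement rotate loop chains more aggressively, which is what most of the test updates below reflect. A minimal sketch of the kind of loop affected (the function and value names here are illustrative assumptions, not taken from the tests):

    ; Without rotation, llc places %loop first and keeps the backedge as
    ; a conditional branch back to it. With aggressive rotation, MBP
    ; enters the loop through an unconditional branch and lays the chain
    ; out so the backedge falls through into the header; on AMDGPU this
    ; is why checks below gain an s_branch into the loop header and flip
    ; s_cbranch_execnz backedges into s_cbranch_execz exits.
    define i32 @rotate_demo(i32 %n) {
    entry:
      br label %loop

    loop:
      %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
      %inc = add i32 %i, 1
      %cc = icmp slt i32 %inc, %n
      br i1 %cc, label %loop, label %exit

    exit:
      ret i32 %inc
    }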
llvm-svn: 370398
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index a425bcf..6a8456d 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -230,6 +230,11 @@
; Make sure scc liveness is updated if s_or_b64 is removed
; ALL-LABEL: {{^}}scc_liveness:
+; GCN: %bb10
+; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: s_andn2_b64
+; GCN-NEXT: s_cbranch_execz
+
; GCN: [[BB1_LOOP:BB[0-9]+_[0-9]+]]:
; GCN: s_andn2_b64 exec, exec,
; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
@@ -240,10 +245,6 @@
; GCN-NOT: s_or_b64 exec, exec
; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
-; GCN: s_andn2_b64
-; GCN-NEXT: s_cbranch_execnz
-
-; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index 2060ea4..c903a04 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -20,29 +20,13 @@
; CHECK-NEXT: ; implicit-def: $sgpr8_sgpr9
; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7
; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3
-; CHECK-NEXT: BB0_1: ; %loop
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 32, v1
-; CHECK-NEXT: s_and_b64 vcc, exec, vcc
-; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], exec
-; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], exec
-; CHECK-NEXT: s_cbranch_vccz BB0_5
-; CHECK-NEXT: ; %bb.2: ; %endif1
-; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_mov_b64 s[6:7], -1
-; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[0:1]
-; CHECK-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; CHECK-NEXT: ; mask branch BB0_4
-; CHECK-NEXT: BB0_3: ; %endif2
-; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: v_add_u32_e32 v1, 1, v1
-; CHECK-NEXT: s_xor_b64 s[6:7], exec, -1
-; CHECK-NEXT: BB0_4: ; %Flow1
-; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: s_branch BB0_3
+; CHECK-NEXT: BB0_1: ; %Flow1
+; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b64 exec, exec, s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: BB0_5: ; %Flow
-; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: BB0_2: ; %Flow
+; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_and_b64 s[10:11], exec, s[6:7]
; CHECK-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5]
; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
@@ -50,8 +34,27 @@
; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; CHECK-NEXT: s_mov_b64 s[4:5], s[10:11]
; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; CHECK-NEXT: s_cbranch_execnz BB0_1
-; CHECK-NEXT: ; %bb.6: ; %Flow2
+; CHECK-NEXT: s_cbranch_execz BB0_6
+; CHECK-NEXT: BB0_3: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 32, v1
+; CHECK-NEXT: s_and_b64 vcc, exec, vcc
+; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], exec
+; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], exec
+; CHECK-NEXT: s_cbranch_vccz BB0_2
+; CHECK-NEXT: ; %bb.4: ; %endif1
+; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: s_mov_b64 s[6:7], -1
+; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[0:1]
+; CHECK-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; CHECK-NEXT: ; mask branch BB0_1
+; CHECK-NEXT: s_cbranch_execz BB0_1
+; CHECK-NEXT: BB0_5: ; %endif2
+; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: v_add_u32_e32 v1, 1, v1
+; CHECK-NEXT: s_xor_b64 s[6:7], exec, -1
+; CHECK-NEXT: s_branch BB0_1
+; CHECK-NEXT: BB0_6: ; %Flow2
; CHECK-NEXT: s_or_b64 exec, exec, s[10:11]
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
@@ -62,6 +65,7 @@
; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: exp mrt0 v1, v1, v1, v1 done vm
; CHECK-NEXT: s_endpgm
+; This is the divergent branch with the condition not marked as divergent
start:
%v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0)
br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
index be6e3fd..1a675ce 100644
--- a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
@@ -1,16 +1,5 @@
; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s
-; CHECK-LABEL: %bb11
-
-; Load from %arg in a Loop body has alias store
-
-; CHECK: flat_load_dword
-
-; CHECK-LABEL: %bb20
-; CHECK: flat_store_dword
-
-; #####################################################################
-
; CHECK-LABEL: %bb22
; Load from %arg has alias store in Loop
@@ -23,6 +12,18 @@
; CHECK: s_load_dword
+; #####################################################################
+
+; CHECK-LABEL: %bb11
+
+; Load from %arg in a Loop body has alias store
+
+; CHECK: flat_load_dword
+
+; CHECK-LABEL: %bb20
+
+; CHECK: flat_store_dword
+
define amdgpu_kernel void @cfg(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
bb:
%tmp = sext i32 %arg2 to i64
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
index ae78a1e..a050bfe 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -3,20 +3,20 @@
; SI-LABEL: {{^}}i1_copy_from_loop:
;
+; SI: ; %Flow
+; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
+; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], exec
+; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
+
; SI: ; %for.body
; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI-DAG: s_andn2_b64 [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG: s_andn2_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec
; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
; SI: ; %Flow1
; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec
-; SI: ; %Flow
-; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
-; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
-; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
-
; SI: ; %for.end
; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index dc7f495..bde1cd5 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -630,12 +630,7 @@
; GCN-LABEL: {{^}}broken_phi_bb:
; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
-; GCN: s_branch [[BB2:BB[0-9]+_[0-9]+]]
-
-; GCN: {{^BB[0-9]+_[0-9]+}}:
-; GCN: s_mov_b64 exec,
-
-; GCN: [[BB2]]:
+; GCN: [[BB2:BB[0-9]+_[0-9]+]]:
; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
; GCN: buffer_load_dword
@@ -647,6 +642,11 @@
; IDXMODE: s_set_gpr_idx_off
; GCN: s_cbranch_execnz [[REGLOOP]]
+
+; GCN: {{^; %bb.[0-9]}}:
+; GCN: s_mov_b64 exec,
+; GCN: s_branch [[BB2]]
+
define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb:
br label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
index 486364a..f374276 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -61,9 +61,9 @@
; GCN-LABEL: {{^}}break_cond_is_arg:
; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
+; GCN: s_andn2_b64 exec, exec, [[REG3:[^ ,]*]]
; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
-; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
-; GCN: s_andn2_b64 exec, exec, [[REG3]]
+; GCN: s_or_b64 [[REG3]], [[REG2]],
define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index d243233..0ae28c6 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -24,13 +24,29 @@
; GCN: ; %main_body
; GCN: s_mov_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: [[FLOW2:BB[0-9]+_[0-9]+]]: ; %Flow2
+; GCN: s_or_b64 exec, exec, [[TMP0:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
+; GCN: s_mov_b64 [[LEFT_OUTER]], [[TMP1]]
+; GCN: s_andn2_b64 exec, exec, [[TMP1]]
+; GCN: s_cbranch_execz [[IF_BLOCK:BB[0-9]+_[0-9]+]]
+
; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
; GCN: s_mov_b64 [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: ; %Flow
+; GCN: s_or_b64 exec, exec, [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 [[TMP0]], exec, [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 [[TMP0]], [[TMP0]], [[LEFT_INNER]]
+; GCN: s_mov_b64 [[LEFT_INNER]], [[TMP0]]
+; GCN: s_andn2_b64 exec, exec, [[TMP0]]
+; GCN: s_cbranch_execz [[FLOW2]]
+
; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN: s_or_b64 [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
-; GCN: s_or_b64 [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
-; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN: s_or_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], exec
+; GCN: s_or_b64 [[BREAK_INNER]], [[BREAK_INNER]], exec
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC]], vcc
; FIXME: duplicate comparison
; GCN: ; %ENDIF
@@ -43,23 +59,7 @@
; GCN-DAG: s_or_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
; GCN-DAG: s_or_b64 [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]
-; GCN: ; %Flow
-; GCN: s_or_b64 exec, exec, [[SAVE_EXEC]]
-; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
-; GCN: s_or_b64 [[TMP0]], [[TMP0]], [[LEFT_INNER]]
-; GCN: s_mov_b64 [[LEFT_INNER]], [[TMP0]]
-; GCN: s_andn2_b64 exec, exec, [[TMP0]]
-; GCN: s_cbranch_execnz [[INNER_LOOP]]
-
-; GCN: ; %Flow2
-; GCN: s_or_b64 exec, exec, [[TMP0]]
-; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
-; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
-; GCN: s_mov_b64 [[LEFT_OUTER]], [[TMP1]]
-; GCN: s_andn2_b64 exec, exec, [[TMP1]]
-; GCN: s_cbranch_execnz [[OUTER_LOOP]]
-
-; GCN: ; %IF
+; GCN: [[IF_BLOCK]]: ; %IF
; GCN-NEXT: s_endpgm
define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
main_body:
@@ -92,12 +92,18 @@
; GCN-LABEL: {{^}}multi_if_break_loop:
; GCN: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: ; %Flow4
+; GCN: s_and_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK]]
+; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_andn2_b64 exec, exec, [[LEFT]]
+; GCN-NEXT: s_cbranch_execz
+
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
-; GCN: s_mov_b64 [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+; GCN: s_mov_b64 [[OLD_LEFT]], [[LEFT]]
; GCN: ; %LeafBlock1
; GCN: s_mov_b64
-; GCN: s_mov_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN: s_mov_b64 [[BREAK]], -1{{$}}
; GCN: ; %case1
; GCN: buffer_load_dword [[LOAD2:v[0-9]+]],
@@ -118,12 +124,6 @@
; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]]
-; GCN: ; %Flow4
-; GCN: s_and_b64 [[BREAK]], exec, [[BREAK]]
-; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT]]
-; GCN: s_andn2_b64 exec, exec, [[LEFT]]
-; GCN-NEXT: s_cbranch_execnz
-
define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index be5d8d4..2be9926 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -3,11 +3,11 @@
; GCN-LABEL: {{^}}negated_cond:
; GCN: BB0_1:
; GCN: v_cmp_eq_u32_e64 [[CC:[^,]+]],
-; GCN: BB0_2:
+; GCN: BB0_3:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_cmp
; GCN: s_andn2_b64 vcc, exec, [[CC]]
-; GCN: s_cbranch_vccnz BB0_4
+; GCN: s_cbranch_vccnz BB0_2
define amdgpu_kernel void @negated_cond(i32 addrspace(1)* %arg1) {
bb:
br label %bb1
@@ -36,11 +36,11 @@
; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
; GCN: v_cmp_eq_u32_e64 [[CC:[^,]+]],
-; GCN: BB1_1:
+; GCN: %bb4
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_cmp
; GCN: s_andn2_b64 vcc, exec, [[CC]]
-; GCN: s_cbranch_vccz BB1_3
+; GCN: s_cbranch_vccnz BB1_1
define amdgpu_kernel void @negated_cond_dominated_blocks(i32 addrspace(1)* %arg1) {
bb:
br label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index ad68d30..1492874 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -96,20 +96,20 @@
; FUNC-LABEL: {{^}}loop_land_info_assert:
; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
; SI: s_and_b64 [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
-; SI: s_branch [[INFLOOP:BB[0-9]+_[0-9]+]]
+
+; SI: [[WHILELOOP:BB[0-9]+_[0-9]+]]: ; %while.cond
+; SI: s_cbranch_vccz [[FOR_COND_PH:BB[0-9]+_[0-9]+]]
; SI: [[CONVEX_EXIT:BB[0-9_]+]]
; SI: s_mov_b64 vcc,
; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
-; SI: s_cbranch_vccnz [[INFLOOP]]
+
+; SI: s_cbranch_vccnz [[WHILELOOP]]
; SI: ; %if.else
; SI: buffer_store_dword
-; SI: [[INFLOOP]]:
-; SI: s_cbranch_vccnz [[CONVEX_EXIT]]
-
-; SI: ; %for.cond.preheader
+; SI: [[FOR_COND_PH]]: ; %for.cond.preheader
; SI: s_cbranch_vccz [[ENDPGM]]
; SI: [[ENDPGM]]:
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index e1da3cf..a71ca5d 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -166,30 +166,29 @@
}
; GCN-LABEL: {{^}}test_loop_with_if:
-; GCN: BB{{.*}}: ; %bb2
+; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
+; GFX1032: s_andn2_b32 exec_lo, exec_lo, s{{[0-9]+}}
+; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
+; GFX1064: s_andn2_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN: s_cbranch_execz
+; GCN: BB{{.*}}:
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
; GCN: s_cbranch_execz
-; GCN: BB{{.*}}: ; %bb5
-; GCN: BB{{.*}}: ; %Flow
+; GCN: BB{{.*}}:
+; GCN: BB{{.*}}:
; GFX1032: s_xor_b32 s{{[0-9]+}}, exec_lo, s{{[0-9]+}}
; GFX1064: s_xor_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}]
; GCN: ; mask branch BB
-; GCN: BB{{.*}}: ; %bb11
-; GCN: BB{{.*}}: ; %Flow1
+; GCN: BB{{.*}}:
+; GCN: BB{{.*}}:
; GFX1032: s_or_b32 exec_lo, exec_lo, s{{[0-9]+}}
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: s_or_b64 exec, exec, s[{{[0-9:]+}}]
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
; GCN: ; mask branch BB
-; GCN: BB{{.*}}: ; %bb10
-; GCN: BB{{.*}}: ; %bb13
-; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
-; GFX1032: s_andn2_b32 exec_lo, exec_lo, s{{[0-9]+}}
-; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
-; GFX1064: s_andn2_b64 exec, exec, s[{{[0-9:]+}}]
-; GCN: s_cbranch_execnz
-; GCN: ; %bb1
+; GCN: BB{{.*}}:
+; GCN: BB{{.*}}:
; GCN: s_endpgm
define amdgpu_kernel void @test_loop_with_if(i32 addrspace(1)* %arg) #0 {
bb:
@@ -231,17 +230,16 @@
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
; GCN: ; mask branch
; GCN: s_cbranch_execz
-; GCN: BB{{.*}}: ; %.preheader
-; GCN: ; %bb8
+; GCN: BB{{.*}}:
+; GCN: BB{{.*}}:
; GFX1032: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, exec_lo
; GFX1064: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], exec
; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
-; GCN: BB{{.*}}: ; %Flow
-; GCN: s_cbranch_execnz
-; GCN: BB{{.*}}: ; %.loopexit
+; GCN: s_cbranch_execz
+; GCN: BB{{.*}}:
define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -657,7 +655,7 @@
; GCN-LABEL: {{^}}test_loop_vcc:
; GFX1032: v_cmp_lt_f32_e32 vcc_lo,
; GFX1064: v_cmp_lt_f32_e32 vcc,
-; GCN: s_cbranch_vccz
+; GCN: s_cbranch_vccnz
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
entry:
br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 41ff30b..b827668 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -650,12 +650,15 @@
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
-; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
-; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
+; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %loop
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
-; CHECK: s_cbranch_vccz [[LOOPHDR]]
-; CHECK: ; %break
+; CHECK: s_cbranch_vccnz
+; CHECK: ; %body
+; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
+; CHECK: s_branch [[LOOPHDR]]
+
+; CHECK: ; %break
; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry: