Revert [MBP] Disable aggressive loop rotate in plain mode

This reverts r369664 (git commit 51f48295cbe8fa3a44db263b528dd9f7bae7bf9a)

It causes many benchmark regressions, both internally and in LLVM's benchmark suite.

llvm-svn: 370398
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index a425bcf..6a8456d 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -230,6 +230,11 @@
 ; Make sure scc liveness is updated if s_or_b64 is removed
 ; ALL-LABEL: {{^}}scc_liveness:
 
+; GCN: %bb10
+; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: s_andn2_b64
+; GCN-NEXT: s_cbranch_execz
+
 ; GCN: [[BB1_LOOP:BB[0-9]+_[0-9]+]]:
 ; GCN: s_andn2_b64 exec, exec,
 ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
@@ -240,10 +245,6 @@
 ; GCN-NOT: s_or_b64 exec, exec
 
 ; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
-; GCN: s_andn2_b64
-; GCN-NEXT: s_cbranch_execnz
-
-; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index 2060ea4..c903a04 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -20,29 +20,13 @@
 ; CHECK-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; CHECK-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; CHECK-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; CHECK-NEXT:  BB0_1: ; %loop
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v1
-; CHECK-NEXT:    s_and_b64 vcc, exec, vcc
-; CHECK-NEXT:    s_or_b64 s[6:7], s[6:7], exec
-; CHECK-NEXT:    s_or_b64 s[8:9], s[8:9], exec
-; CHECK-NEXT:    s_cbranch_vccz BB0_5
-; CHECK-NEXT:  ; %bb.2: ; %endif1
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_mov_b64 s[6:7], -1
-; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[0:1]
-; CHECK-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; CHECK-NEXT:    ; mask branch BB0_4
-; CHECK-NEXT:  BB0_3: ; %endif2
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    v_add_u32_e32 v1, 1, v1
-; CHECK-NEXT:    s_xor_b64 s[6:7], exec, -1
-; CHECK-NEXT:  BB0_4: ; %Flow1
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    s_branch BB0_3
+; CHECK-NEXT:  BB0_1: ; %Flow1
+; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
-; CHECK-NEXT:  BB0_5: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:  BB0_2: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_and_b64 s[10:11], exec, s[6:7]
 ; CHECK-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
 ; CHECK-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
@@ -50,8 +34,27 @@
 ; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[10:11]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; CHECK-NEXT:    s_cbranch_execnz BB0_1
-; CHECK-NEXT:  ; %bb.6: ; %Flow2
+; CHECK-NEXT:    s_cbranch_execz BB0_6
+; CHECK-NEXT:  BB0_3: ; %loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v1
+; CHECK-NEXT:    s_and_b64 vcc, exec, vcc
+; CHECK-NEXT:    s_or_b64 s[6:7], s[6:7], exec
+; CHECK-NEXT:    s_or_b64 s[8:9], s[8:9], exec
+; CHECK-NEXT:    s_cbranch_vccz BB0_2
+; CHECK-NEXT:  ; %bb.4: ; %endif1
+; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    s_mov_b64 s[6:7], -1
+; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[0:1]
+; CHECK-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; CHECK-NEXT:    ; mask branch BB0_1
+; CHECK-NEXT:    s_cbranch_execz BB0_1
+; CHECK-NEXT:  BB0_5: ; %endif2
+; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    v_add_u32_e32 v1, 1, v1
+; CHECK-NEXT:    s_xor_b64 s[6:7], exec, -1
+; CHECK-NEXT:    s_branch BB0_1
+; CHECK-NEXT:  BB0_6: ; %Flow2
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
@@ -62,6 +65,7 @@
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    exp mrt0 v1, v1, v1, v1 done vm
 ; CHECK-NEXT:    s_endpgm
+; This is the divergent branch with the condition not marked as divergent
 start:
   %v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0)
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
index be6e3fd..1a675ce 100644
--- a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
@@ -1,16 +1,5 @@
 ; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs  < %s | FileCheck %s
 
-; CHECK-LABEL: %bb11
-
-; Load from %arg in a Loop body has alias store
-
-; CHECK: flat_load_dword
-
-; CHECK-LABEL: %bb20
-; CHECK: flat_store_dword
-
-; #####################################################################
-
 ; CHECK-LABEL: %bb22
 
 ; Load from %arg has alias store in Loop
@@ -23,6 +12,18 @@
 
 ; CHECK: s_load_dword
 
+; #####################################################################
+
+; CHECK-LABEL: %bb11
+
+; Load from %arg in a Loop body has alias store
+
+; CHECK: flat_load_dword
+
+; CHECK-LABEL: %bb20
+
+; CHECK: flat_store_dword
+
 define amdgpu_kernel void @cfg(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
 bb:
   %tmp = sext i32 %arg2 to i64
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
index ae78a1e..a050bfe 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -3,20 +3,20 @@
 
 ; SI-LABEL: {{^}}i1_copy_from_loop:
 ;
+; SI: ; %Flow
+; SI-DAG:  s_andn2_b64       [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
+; SI-DAG:  s_and_b64         [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], exec
+; SI:      s_or_b64          [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
+
 ; SI: ; %for.body
 ; SI:      v_cmp_gt_u32_e64  [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI-DAG:  s_andn2_b64       [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG:  s_andn2_b64       [[CC_ACCUM]], [[CC_ACCUM]], exec
 ; SI-DAG:  s_and_b64         [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
 ; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
 
 ; SI: ; %Flow1
 ; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], exec
 
-; SI: ; %Flow
-; SI-DAG:  s_andn2_b64       [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
-; SI-DAG:  s_and_b64         [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
-; SI:      s_or_b64          [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
-
 ; SI: ; %for.end
 ; SI:      s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index dc7f495..bde1cd5 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -630,12 +630,7 @@
 ; GCN-LABEL: {{^}}broken_phi_bb:
 ; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
 
-; GCN: s_branch [[BB2:BB[0-9]+_[0-9]+]]
-
-; GCN: {{^BB[0-9]+_[0-9]+}}:
-; GCN: s_mov_b64 exec,
-
-; GCN: [[BB2]]:
+; GCN: [[BB2:BB[0-9]+_[0-9]+]]:
 ; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
 ; GCN: buffer_load_dword
 
@@ -647,6 +642,11 @@
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: s_cbranch_execnz [[REGLOOP]]
+
+; GCN: {{^; %bb.[0-9]}}:
+; GCN: s_mov_b64 exec,
+; GCN: s_branch [[BB2]]
+
 define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
 bb:
   br label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
index 486364a..f374276 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -61,9 +61,9 @@
 
 ; GCN-LABEL: {{^}}break_cond_is_arg:
 ; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
+; GCN: s_andn2_b64 exec, exec, [[REG3:[^ ,]*]]
 ; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
-; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
-; GCN: s_andn2_b64 exec, exec, [[REG3]]
+; GCN: s_or_b64 [[REG3]], [[REG2]],
 
 define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index d243233..0ae28c6 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -24,13 +24,29 @@
 ; GCN: ; %main_body
 ; GCN:      s_mov_b64           [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
+; GCN: [[FLOW2:BB[0-9]+_[0-9]+]]: ; %Flow2
+; GCN:      s_or_b64            exec, exec, [[TMP0:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_and_b64           [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_or_b64            [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
+; GCN:      s_mov_b64           [[LEFT_OUTER]], [[TMP1]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP1]]
+; GCN:      s_cbranch_execz    [[IF_BLOCK:BB[0-9]+_[0-9]+]]
+
 ; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
 ; GCN:      s_mov_b64           [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
+; GCN: ; %Flow
+; GCN:      s_or_b64            exec, exec, [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_and_b64           [[TMP0]], exec, [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_or_b64            [[TMP0]], [[TMP0]], [[LEFT_INNER]]
+; GCN:      s_mov_b64           [[LEFT_INNER]], [[TMP0]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP0]]
+; GCN:      s_cbranch_execz    [[FLOW2]]
+
 ; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN:      s_or_b64            [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
-; GCN:      s_or_b64            [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
-; GCN:      s_and_saveexec_b64  [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN:      s_or_b64            [[BREAK_OUTER]], [[BREAK_OUTER]], exec
+; GCN:      s_or_b64            [[BREAK_INNER]], [[BREAK_INNER]], exec
+; GCN:      s_and_saveexec_b64  [[SAVE_EXEC]], vcc
 
 ; FIXME: duplicate comparison
 ; GCN: ; %ENDIF
@@ -43,23 +59,7 @@
 ; GCN-DAG:  s_or_b64            [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
 ; GCN-DAG:  s_or_b64            [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]
 
-; GCN: ; %Flow
-; GCN:      s_or_b64            exec, exec, [[SAVE_EXEC]]
-; GCN:      s_and_b64           [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
-; GCN:      s_or_b64            [[TMP0]], [[TMP0]], [[LEFT_INNER]]
-; GCN:      s_mov_b64           [[LEFT_INNER]], [[TMP0]]
-; GCN:      s_andn2_b64         exec, exec, [[TMP0]]
-; GCN:      s_cbranch_execnz    [[INNER_LOOP]]
-
-; GCN: ; %Flow2
-; GCN:      s_or_b64            exec, exec, [[TMP0]]
-; GCN:      s_and_b64           [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
-; GCN:      s_or_b64            [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
-; GCN:      s_mov_b64           [[LEFT_OUTER]], [[TMP1]]
-; GCN:      s_andn2_b64         exec, exec, [[TMP1]]
-; GCN:      s_cbranch_execnz    [[OUTER_LOOP]]
-
-; GCN: ; %IF
+; GCN: [[IF_BLOCK]]: ; %IF
 ; GCN-NEXT: s_endpgm
 define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 main_body:
@@ -92,12 +92,18 @@
 ; GCN-LABEL: {{^}}multi_if_break_loop:
 ; GCN:      s_mov_b64          [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
+; GCN: ; %Flow4
+; GCN:      s_and_b64          [[BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK]]
+; GCN:      s_or_b64           [[LEFT]], [[BREAK]], [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_andn2_b64        exec, exec, [[LEFT]]
+; GCN-NEXT: s_cbranch_execz
+
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
-; GCN:      s_mov_b64          [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+; GCN:      s_mov_b64          [[OLD_LEFT]], [[LEFT]]
 
 ; GCN: ; %LeafBlock1
 ; GCN:      s_mov_b64
-; GCN:      s_mov_b64          [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN:      s_mov_b64          [[BREAK]], -1{{$}}
 
 ; GCN: ; %case1
 ; GCN:      buffer_load_dword  [[LOAD2:v[0-9]+]],
@@ -118,12 +124,6 @@
 ; GCN-DAG:  s_and_b64          [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
 ; GCN:      s_or_b64           [[BREAK]], [[BREAK]], [[TMP]]
 
-; GCN: ; %Flow4
-; GCN:      s_and_b64          [[BREAK]], exec, [[BREAK]]
-; GCN:      s_or_b64           [[LEFT]], [[BREAK]], [[OLD_LEFT]]
-; GCN:      s_andn2_b64        exec, exec, [[LEFT]]
-; GCN-NEXT: s_cbranch_execnz
-
 define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index be5d8d4..2be9926 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -3,11 +3,11 @@
 ; GCN-LABEL: {{^}}negated_cond:
 ; GCN: BB0_1:
 ; GCN:   v_cmp_eq_u32_e64 [[CC:[^,]+]],
-; GCN: BB0_2:
+; GCN: BB0_3:
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_cmp
 ; GCN:   s_andn2_b64 vcc, exec, [[CC]]
-; GCN:   s_cbranch_vccnz BB0_4
+; GCN:   s_cbranch_vccnz BB0_2
 define amdgpu_kernel void @negated_cond(i32 addrspace(1)* %arg1) {
 bb:
   br label %bb1
@@ -36,11 +36,11 @@
 
 ; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
 ; GCN:   v_cmp_eq_u32_e64 [[CC:[^,]+]],
-; GCN: BB1_1:
+; GCN: %bb4
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_cmp
 ; GCN:   s_andn2_b64 vcc, exec, [[CC]]
-; GCN:   s_cbranch_vccz BB1_3
+; GCN:   s_cbranch_vccnz BB1_1
 define amdgpu_kernel void @negated_cond_dominated_blocks(i32 addrspace(1)* %arg1) {
 bb:
   br label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index ad68d30..1492874 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -96,20 +96,20 @@
 ; FUNC-LABEL: {{^}}loop_land_info_assert:
 ; SI:      v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
 ; SI:      s_and_b64        [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
-; SI:      s_branch         [[INFLOOP:BB[0-9]+_[0-9]+]]
+
+; SI: [[WHILELOOP:BB[0-9]+_[0-9]+]]: ; %while.cond
+; SI:      s_cbranch_vccz [[FOR_COND_PH:BB[0-9]+_[0-9]+]]
 
 ; SI:      [[CONVEX_EXIT:BB[0-9_]+]]
 ; SI:      s_mov_b64        vcc,
 ; SI-NEXT: s_cbranch_vccnz  [[ENDPGM:BB[0-9]+_[0-9]+]]
-; SI:      s_cbranch_vccnz  [[INFLOOP]]
+
+; SI:      s_cbranch_vccnz  [[WHILELOOP]]
 
 ; SI: ; %if.else
 ; SI:      buffer_store_dword
 
-; SI:      [[INFLOOP]]:
-; SI:      s_cbranch_vccnz [[CONVEX_EXIT]]
-
-; SI: ; %for.cond.preheader
+; SI: [[FOR_COND_PH]]: ; %for.cond.preheader
 ; SI:      s_cbranch_vccz [[ENDPGM]]
 
 ; SI:      [[ENDPGM]]:
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index e1da3cf..a71ca5d 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -166,30 +166,29 @@
 }
 
 ; GCN-LABEL: {{^}}test_loop_with_if:
-; GCN:   BB{{.*}}: ; %bb2
+; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
+; GFX1032: s_andn2_b32 exec_lo, exec_lo, s{{[0-9]+}}
+; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
+; GFX1064: s_andn2_b64 exec, exec, s[{{[0-9:]+}}]
+; GCN:     s_cbranch_execz
+; GCN:   BB{{.*}}:
 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
 ; GCN:     s_cbranch_execz
-; GCN:   BB{{.*}}: ; %bb5
-; GCN:   BB{{.*}}: ; %Flow
+; GCN:   BB{{.*}}:
+; GCN:   BB{{.*}}:
 ; GFX1032: s_xor_b32 s{{[0-9]+}}, exec_lo, s{{[0-9]+}}
 ; GFX1064: s_xor_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}]
 ; GCN:     ; mask branch BB
-; GCN:   BB{{.*}}: ; %bb11
-; GCN:   BB{{.*}}: ; %Flow1
+; GCN:   BB{{.*}}:
+; GCN:   BB{{.*}}:
 ; GFX1032: s_or_b32 exec_lo, exec_lo, s{{[0-9]+}}
 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX1064: s_or_b64 exec, exec, s[{{[0-9:]+}}]
 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
 ; GCN:     ; mask branch BB
-; GCN:   BB{{.*}}: ; %bb10
-; GCN:   BB{{.*}}: ; %bb13
-; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
-; GFX1032: s_andn2_b32 exec_lo, exec_lo, s{{[0-9]+}}
-; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
-; GFX1064: s_andn2_b64 exec, exec, s[{{[0-9:]+}}]
-; GCN:     s_cbranch_execnz
-; GCN:   ; %bb1
+; GCN:   BB{{.*}}:
+; GCN:   BB{{.*}}:
 ; GCN:     s_endpgm
 define amdgpu_kernel void @test_loop_with_if(i32 addrspace(1)* %arg) #0 {
 bb:
@@ -231,17 +230,16 @@
 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
 ; GCN:     ; mask branch
 ; GCN:     s_cbranch_execz
-; GCN:   BB{{.*}}: ; %.preheader
-; GCN:   ; %bb8
+; GCN:   BB{{.*}}:
+; GCN:   BB{{.*}}:
 ; GFX1032: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, exec_lo
 ; GFX1064: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], exec
 ; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
 ; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
 ; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
-; GCN:   BB{{.*}}: ; %Flow
-; GCN:     s_cbranch_execnz
-; GCN:   BB{{.*}}: ; %.loopexit
+; GCN:     s_cbranch_execz
+; GCN:   BB{{.*}}:
 define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -657,7 +655,7 @@
 ; GCN-LABEL: {{^}}test_loop_vcc:
 ; GFX1032: v_cmp_lt_f32_e32 vcc_lo,
 ; GFX1064: v_cmp_lt_f32_e32 vcc,
-; GCN: s_cbranch_vccz
+; GCN: s_cbranch_vccnz
 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
 entry:
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 41ff30b..b827668 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -650,12 +650,15 @@
 ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
 ; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
 
-; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
-; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
+; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %loop
 ; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
-; CHECK: s_cbranch_vccz [[LOOPHDR]]
-; CHECK: ; %break
+; CHECK: s_cbranch_vccnz
 
+; CHECK: ; %body
+; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
+; CHECK: s_branch [[LOOPHDR]]
+
+; CHECK: ; %break
 ; CHECK: ; return
 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 entry: