StructurizeCFG: Fix broken backedge detection

The work order was changed in r228186 from SCC order
to RPO with an arbitrary sorting function. The sorting
function attempted to move inner loop nodes earlier. This
was was apparently relying on an assumption that every block
in a given loop / the same loop depth would be seen before
visiting another loop. In the broken testcase, a block
outside of the loop was encountered before moving onto
another block in the same loop. The testcase would then
structurize such that one blocks unconditional successor
could never be reached.

Revert to plain RPO for the analysis phase. This fixes
detecting edges as backedges that aren't really.

The processing phase does use another visited set, and
I'm unclear on whether the order there is as important.
An arbitrary order doesn't work, and triggers some infinite
loops. The reversed RPO list seems to work and is closer
to the order that was used before, minus the arbitary
custom sorting.

A few of the changed tests now produce smaller code,
and a few are slightly worse looking.

llvm-svn: 321751
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index b8fb80b..525425b 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -14,7 +14,6 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
@@ -177,9 +176,8 @@
   Region *ParentRegion;
 
   DominatorTree *DT;
-  LoopInfo *LI;
 
-  SmallVector<RegionNode *, 8> Order;
+  std::deque<RegionNode *> Order;
   BBSet Visited;
 
   BBPhiMap DeletedPhis;
@@ -204,7 +202,7 @@
 
   void gatherPredicates(RegionNode *N);
 
-  void collectInfos();
+  void analyzeNode(RegionNode *N);
 
   void insertConditions(bool Loops);
 
@@ -258,7 +256,6 @@
       AU.addRequired<DivergenceAnalysis>();
     AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<LoopInfoWrapperPass>();
 
     AU.addPreserved<DominatorTreeWrapperPass>();
     RegionPass::getAnalysisUsage(AU);
@@ -292,55 +289,17 @@
 
 /// \brief Build up the general order of nodes
 void StructurizeCFG::orderNodes() {
-  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
-  SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
+  assert(Visited.empty());
+  assert(Predicates.empty());
+  assert(Loops.empty());
+  assert(LoopPreds.empty());
 
-  // The reverse post-order traversal of the list gives us an ordering close
-  // to what we want.  The only problem with it is that sometimes backedges
-  // for outer loops will be visited before backedges for inner loops.
-  for (RegionNode *RN : RPOT) {
-    BasicBlock *BB = RN->getEntry();
-    Loop *Loop = LI->getLoopFor(BB);
-    ++LoopBlocks[Loop];
+  // This must be RPO order for the back edge detection to work
+  for (RegionNode *RN : ReversePostOrderTraversal<Region*>(ParentRegion)) {
+    // FIXME: Is there a better order to use for structurization?
+    Order.push_back(RN);
+    analyzeNode(RN);
   }
-
-  unsigned CurrentLoopDepth = 0;
-  Loop *CurrentLoop = nullptr;
-  for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
-    BasicBlock *BB = (*I)->getEntry();
-    unsigned LoopDepth = LI->getLoopDepth(BB);
-
-    if (is_contained(Order, *I))
-      continue;
-
-    if (LoopDepth < CurrentLoopDepth) {
-      // Make sure we have visited all blocks in this loop before moving back to
-      // the outer loop.
-
-      auto LoopI = I;
-      while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
-        LoopI++;
-        BasicBlock *LoopBB = (*LoopI)->getEntry();
-        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
-          --BlockCount;
-          Order.push_back(*LoopI);
-        }
-      }
-    }
-
-    CurrentLoop = LI->getLoopFor(BB);
-    if (CurrentLoop)
-      LoopBlocks[CurrentLoop]--;
-
-    CurrentLoopDepth = LoopDepth;
-    Order.push_back(*I);
-  }
-
-  // This pass originally used a post-order traversal and then operated on
-  // the list in reverse. Now that we are using a reverse post-order traversal
-  // rather than re-working the whole pass to operate on the list in order,
-  // we just reverse the list and continue to operate on it in reverse.
-  std::reverse(Order.begin(), Order.end());
 }
 
 /// \brief Determine the end of the loops
@@ -466,32 +425,19 @@
 }
 
 /// \brief Collect various loop and predicate infos
-void StructurizeCFG::collectInfos() {
-  // Reset predicate
-  Predicates.clear();
+void StructurizeCFG::analyzeNode(RegionNode *RN) {
+  DEBUG(dbgs() << "Visiting: "
+        << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+        << RN->getEntry()->getName() << '\n');
 
-  // and loop infos
-  Loops.clear();
-  LoopPreds.clear();
+  // Analyze all the conditions leading to a node
+  gatherPredicates(RN);
 
-  // Reset the visited nodes
-  Visited.clear();
+  // Remember that we've seen this node
+  Visited.insert(RN->getEntry());
 
-  for (RegionNode *RN : reverse(Order)) {
-    DEBUG(dbgs() << "Visiting: "
-                 << (RN->isSubRegion() ? "SubRegion with entry: " : "")
-                 << RN->getEntry()->getName() << " Loop Depth: "
-                 << LI->getLoopDepth(RN->getEntry()) << "\n");
-
-    // Analyze all the conditions leading to a node
-    gatherPredicates(RN);
-
-    // Remember that we've seen this node
-    Visited.insert(RN->getEntry());
-
-    // Find the last back edges
-    analyzeLoops(RN);
-  }
+  // Find the last back edges
+  analyzeLoops(RN);
 }
 
 /// \brief Insert the missing branch conditions
@@ -664,7 +610,7 @@
 BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
   LLVMContext &Context = Func->getContext();
   BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
-                       Order.back()->getEntry();
+                       Order.front()->getEntry();
   BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
                                         Func, Insert);
   DT->addNewBlock(Flow, Dominator);
@@ -744,7 +690,8 @@
 /// Take one node from the order vector and wire it up
 void StructurizeCFG::wireFlow(bool ExitUseAllowed,
                               BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.pop_back_val();
+  RegionNode *Node = Order.front();
+  Order.pop_front();
   Visited.insert(Node->getEntry());
 
   if (isPredictableTrue(Node)) {
@@ -768,7 +715,7 @@
 
     PrevNode = Node;
     while (!Order.empty() && !Visited.count(LoopEnd) &&
-           dominatesPredicates(Entry, Order.back())) {
+           dominatesPredicates(Entry, Order.front())) {
       handleLoops(false, LoopEnd);
     }
 
@@ -779,7 +726,7 @@
 
 void StructurizeCFG::handleLoops(bool ExitUseAllowed,
                                  BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.back();
+  RegionNode *Node = Order.front();
   BasicBlock *LoopStart = Node->getEntry();
 
   if (!Loops.count(LoopStart)) {
@@ -924,10 +871,9 @@
   ParentRegion = R;
 
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 
   orderNodes();
-  collectInfos();
+
   createFlow();
   insertConditions(false);
   insertConditions(true);
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 8cc02d4..5b556f1 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -66,9 +66,10 @@
 
 ; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
 ; OPT: llvm.amdgcn.break
+; OPT: llvm.amdgcn.break
+; OPT: llvm.amdgcn.if.break
+; OPT: llvm.amdgcn.if.break
 ; OPT: llvm.amdgcn.loop
-; OPT: llvm.amdgcn.if.break
-; OPT: llvm.amdgcn.if.break
 ; OPT: llvm.amdgcn.end.cf
 
 ; GCN-LABEL: {{^}}multi_if_break_loop:
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 672549c..96d2841 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -124,55 +124,100 @@
 ; Earlier version of above, before a run of the structurizer.
 ; IR-LABEL: @nested_loop_conditions(
 
-; IR: Flow7:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
-; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
-; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
-; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
+; IR: %tmp1235 = icmp slt i32 %tmp1134, 9
+; IR:   br i1 %tmp1235, label %bb14.lr.ph, label %Flow
 
-; IR: Flow1:
-; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
-; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
-; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
-; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
-; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
-; IR-NEXT: br i1 %18, label %Flow7, label %bb14
-
-; IR: Flow2:
-; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
-; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
-; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
-; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
-; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
-; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
-; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
-; IR-NEXT: br i1 %25, label %bb21, label %Flow3
-
-; IR: bb21:
-; IR: %tmp12 = icmp slt i32 %tmp11, 9
-; IR-NEXT: %27 = xor i1 %tmp12, true
-; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
-; IR-NEXT: br label %Flow3
+; IR: bb14.lr.ph:
+; IR: br label %bb14
 
 ; IR: Flow3:
-; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
-; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
-; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
-; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
-; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
-; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
+; IR:   call void @llvm.amdgcn.end.cf(i64 %18)
+; IR:   %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %17)
+; IR:   %1 = extractvalue { i1, i64 } %0, 0
+; IR:   %2 = extractvalue { i1, i64 } %0, 1
+; IR:   br i1 %1, label %bb4.bb13_crit_edge, label %Flow4
+
+; IR: bb4.bb13_crit_edge:
+; IR:   br label %Flow4
+
+; IR: Flow4:
+; IR:   %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ]
+; IR:   call void @llvm.amdgcn.end.cf(i64 %2)
+; IR:   br label %Flow
+
+; IR: bb13:
+; IR:   br label %bb31
+
+; IR: Flow:
+; IR:   %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ]
+; IR:   %5 = call { i1, i64 } @llvm.amdgcn.if(i1 %4)
+; IR:   %6 = extractvalue { i1, i64 } %5, 0
+; IR:   %7 = extractvalue { i1, i64 } %5, 1
+; IR:   br i1 %6, label %bb13, label %bb31
+
+; IR: bb14:
+; IR:   %phi.broken = phi i64 [ %18, %Flow2 ], [ 0, %bb14.lr.ph ]
+; IR:   %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %16, %Flow2 ]
+; IR:   %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %15, %Flow2 ]
+; IR:   %tmp15 = icmp eq i32 %tmp1037, 1
+; IR:   %8 = xor i1 %tmp15, true
+; IR:   %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
+; IR:   %10 = extractvalue { i1, i64 } %9, 0
+; IR:   %11 = extractvalue { i1, i64 } %9, 1
+; IR:   br i1 %10, label %bb31.loopexit, label %Flow1
+
+; IR: Flow1:
+; IR:   %12 = call { i1, i64 } @llvm.amdgcn.else(i64 %11)
+; IR:   %13 = extractvalue { i1, i64 } %12, 0
+; IR:   %14 = extractvalue { i1, i64 } %12, 1
+; IR:   br i1 %13, label %bb16, label %Flow2
+
+; IR: bb16:
+; IR:   %tmp17 = bitcast i64 %tmp3 to <2 x i32>
+; IR:   br label %bb18
+
+; IR: Flow2:
+; IR:   %loop.phi = phi i64 [ %21, %bb21 ], [ %phi.broken, %Flow1 ]
+; IR:   %15 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %Flow1 ]
+; IR:   %16 = phi i32 [ %tmp10, %bb21 ], [ undef, %Flow1 ]
+; IR:   %17 = phi i1 [ %20, %bb21 ], [ false, %Flow1 ]
+; IR:   %18 = call i64 @llvm.amdgcn.else.break(i64 %14, i64 %loop.phi)
+; IR:   call void @llvm.amdgcn.end.cf(i64 %14)
+; IR:   %19 = call i1 @llvm.amdgcn.loop(i64 %18)
+; IR:   br i1 %19, label %Flow3, label %bb14
+
+; IR: bb18:
+; IR:   %tmp19 = load volatile i32, i32 addrspace(1)* undef
+; IR:   %tmp20 = icmp slt i32 %tmp19, 9
+; IR:   br i1 %tmp20, label %bb21, label %bb18
+
+; IR: bb21:
+; IR:   %tmp22 = extractelement <2 x i32> %tmp17, i64 1
+; IR:   %tmp23 = lshr i32 %tmp22, 16
+; IR:   %tmp24 = select i1 undef, i32 undef, i32 %tmp23
+; IR:   %tmp25 = uitofp i32 %tmp24 to float
+; IR:   %tmp26 = fmul float %tmp25, 0x3EF0001000000000
+; IR:   %tmp27 = fsub float %tmp26, undef
+; IR:   %tmp28 = fcmp olt float %tmp27, 5.000000e-01
+; IR:   %tmp29 = select i1 %tmp28, i64 1, i64 2
+; IR:   %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
+; IR:   %tmp7 = zext i32 %tmp30 to i64
+; IR:   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
+; IR:   %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
+; IR:   %tmp10 = extractelement <4 x i32> %tmp9, i64 0
+; IR:   %tmp11 = load volatile i32, i32 addrspace(1)* undef
+; IR:   %tmp12 = icmp slt i32 %tmp11, 9
+; IR:   %20 = xor i1 %tmp12, true
+; IR:   %21 = call i64 @llvm.amdgcn.if.break(i1 %20, i64 %phi.broken)
+; IR:   br label %Flow2
+
+; IR: bb31.loopexit:
+; IR:   br label %Flow1
 
 ; IR: bb31:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
-; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
-; IR-NEXT: ret void
+; IR:   call void @llvm.amdgcn.end.cf(i64 %7)
+; IR:   store volatile i32 0, i32 addrspace(1)* undef
+; IR:   ret void
 
 
 ; GCN-LABEL: {{^}}nested_loop_conditions:
diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll
new file mode 100644
index 0000000..4a9c916
--- /dev/null
+++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll
@@ -0,0 +1,301 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg %s | FileCheck %s
+
+; StructurizeCFG::orderNodes used an arbitrary and nonsensical sorting
+; function which broke the basic backedge identification algorithm. It
+; would use RPO order, but then do a weird partial sort by the loop
+; depth assuming blocks are sorted by loop. However a block can appear
+; in between blocks of a loop that is not part of a loop, breaking the
+; assumption of the sort.
+;
+; The collectInfos must be done in RPO order. The actual
+; structurization order I think is less important, but unless the loop
+; headers are identified in RPO order, it finds the wrong set of back
+; edges.
+
+define amdgpu_kernel void @loop_backedge_misidentified(i32 addrspace(1)* %arg0) #0 {
+; CHECK-LABEL: @loop_backedge_misidentified(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+; CHECK-NEXT:    [[LOAD1:%.*]] = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+; CHECK-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG0:%.*]], i32 [[TID]]
+; CHECK-NEXT:    [[I_INITIAL:%.*]] = load volatile i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       LOOP.HEADER:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP10:%.*]], [[FLOW4:%.*]] ]
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x100b
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[I]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP13]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i64 0
+; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT:    br i1 [[TMP0]], label [[BB62:%.*]], label [[FLOW:%.*]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       bb18:
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP19]], 16
+; CHECK-NEXT:    [[TMP24:%.*]] = urem i32 [[TMP22]], 52
+; CHECK-NEXT:    [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
+; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW4]]
+; CHECK:       INNER_LOOP:
+; CHECK-NEXT:    [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    [[INNER_LOOP_J_INC]] = add nsw i32 [[INNER_LOOP_J]], 1
+; CHECK-NEXT:    [[INNER_LOOP_CMP:%.*]] = icmp eq i32 [[INNER_LOOP_J]], 0
+; CHECK-NEXT:    br i1 [[INNER_LOOP_CMP]], label [[INNER_LOOP_BREAK]], label [[INNER_LOOP]]
+; CHECK:       INNER_LOOP_BREAK:
+; CHECK-NEXT:    [[TMP59]] = extractelement <4 x i32> [[TMP14]], i64 2
+; CHECK-NEXT:    call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+; CHECK-NEXT:    br label [[FLOW3:%.*]]
+; CHECK:       bb62:
+; CHECK-NEXT:    [[LOAD13:%.*]] = icmp ult i32 [[TMP16]], 271
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i1 [[LOAD13]], true
+; CHECK-NEXT:    br i1 [[TMP3]], label [[INCREMENT_I:%.*]], label [[FLOW1:%.*]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[INC_I:%.*]], [[INCREMENT_I]] ], [ undef, [[BB62]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ true, [[INCREMENT_I]] ], [ false, [[BB62]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i1 [ false, [[INCREMENT_I]] ], [ true, [[BB62]] ]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[BB64:%.*]], label [[FLOW2:%.*]]
+; CHECK:       bb64:
+; CHECK-NEXT:    call void asm sideeffect "s_nop 42", "~{memory}"() #0
+; CHECK-NEXT:    br label [[FLOW2]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP7]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ false, [[FLOW2]] ], [ true, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[BB18]], label [[FLOW3]]
+; CHECK:       INCREMENT_I:
+; CHECK-NEXT:    [[INC_I]] = add i32 [[I]], 1
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x1336
+; CHECK-NEXT:    br label [[FLOW1]]
+; CHECK:       END_ELSE_BLOCK:
+; CHECK-NEXT:    [[I_FINAL:%.*]] = phi i32 [ [[TMP1]], [[FLOW3]] ]
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x1337
+; CHECK-NEXT:    [[CMP_END_ELSE_BLOCK:%.*]] = icmp eq i32 [[I_FINAL]], -1
+; CHECK-NEXT:    br label [[FLOW4]]
+; CHECK:       Flow4:
+; CHECK-NEXT:    [[TMP10]] = phi i32 [ [[I_FINAL]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW3]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW3]] ]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[RETURN:%.*]], label [[LOOP_HEADER]]
+; CHECK:       RETURN:
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x99
+; CHECK-NEXT:    store volatile <2 x float> [[LOAD1]], <2 x float> addrspace(1)* undef, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+  %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
+  %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
+  br label %LOOP.HEADER
+
+LOOP.HEADER:
+  %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
+  call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
+  %tmp12 = zext i32 %i to i64
+  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
+  %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
+  %tmp15 = extractelement <4 x i32> %tmp14, i64 0
+  %tmp16 = and i32 %tmp15, 65535
+  %tmp17 = icmp eq i32 %tmp16, 1
+  br i1 %tmp17, label %bb18, label %bb62
+
+bb18:
+  %tmp19 = extractelement <2 x i32> %tmp, i64 0
+  %tmp22 = lshr i32 %tmp19, 16
+  %tmp24 = urem i32 %tmp22, 52
+  %tmp25 = mul nuw nsw i32 %tmp24, 52
+  br label %INNER_LOOP
+
+INNER_LOOP:
+  %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
+  call void asm sideeffect "; inner loop body", ""() #0
+  %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
+  %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
+  br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
+
+INNER_LOOP_BREAK:
+  %tmp59 = extractelement <4 x i32> %tmp14, i64 2
+  call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+  br label %END_ELSE_BLOCK
+
+bb62:
+  %load13 = icmp ult i32 %tmp16, 271
+  br i1 %load13, label %bb64, label %INCREMENT_I
+
+bb64:
+  call void asm sideeffect "s_nop 42", "~{memory}"() #0
+  br label %RETURN
+
+INCREMENT_I:
+  %inc.i = add i32 %i, 1
+  call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
+  br label %END_ELSE_BLOCK
+
+END_ELSE_BLOCK:
+  %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
+  call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
+  %cmp.end.else.block = icmp eq i32 %i.final, -1
+  br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
+
+RETURN:
+  call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
+  store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
+  ret void
+}
+
+; The same function, except break to return block goes directly to the
+; return, which managed to hide the bug.
+define amdgpu_kernel void @loop_backedge_misidentified_alt(i32 addrspace(1)* %arg0) #0 {
+; CHECK-LABEL: @loop_backedge_misidentified_alt(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+; CHECK-NEXT:    [[LOAD1:%.*]] = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+; CHECK-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG0:%.*]], i32 [[TID]]
+; CHECK-NEXT:    [[I_INITIAL:%.*]] = load volatile i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       LOOP.HEADER:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP9:%.*]], [[FLOW3:%.*]] ]
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x100b
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[I]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP13]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i64 0
+; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT:    br i1 [[TMP0]], label [[BB62:%.*]], label [[FLOW:%.*]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[INC_I:%.*]], [[INCREMENT_I:%.*]] ], [ undef, [[BB62]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ true, [[INCREMENT_I]] ], [ false, [[BB62]] ]
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       bb18:
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP19]], 16
+; CHECK-NEXT:    [[TMP24:%.*]] = urem i32 [[TMP22]], 52
+; CHECK-NEXT:    [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
+; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP6:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP7:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3]]
+; CHECK:       INNER_LOOP:
+; CHECK-NEXT:    [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    [[INNER_LOOP_J_INC]] = add nsw i32 [[INNER_LOOP_J]], 1
+; CHECK-NEXT:    [[INNER_LOOP_CMP:%.*]] = icmp eq i32 [[INNER_LOOP_J]], 0
+; CHECK-NEXT:    br i1 [[INNER_LOOP_CMP]], label [[INNER_LOOP_BREAK]], label [[INNER_LOOP]]
+; CHECK:       INNER_LOOP_BREAK:
+; CHECK-NEXT:    [[TMP59]] = extractelement <4 x i32> [[TMP14]], i64 2
+; CHECK-NEXT:    call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+; CHECK-NEXT:    br label [[FLOW2:%.*]]
+; CHECK:       bb62:
+; CHECK-NEXT:    [[LOAD13:%.*]] = icmp ult i32 [[TMP16]], 271
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i1 [[LOAD13]], true
+; CHECK-NEXT:    br i1 [[TMP5]], label [[INCREMENT_I]], label [[FLOW1:%.*]]
+; CHECK:       bb64:
+; CHECK-NEXT:    call void asm sideeffect "s_nop 42", "~{memory}"() #0
+; CHECK-NEXT:    br label [[RETURN:%.*]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP6]] = phi i32 [ [[TMP1]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP7]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[BB18]], label [[FLOW2]]
+; CHECK:       INCREMENT_I:
+; CHECK-NEXT:    [[INC_I]] = add i32 [[I]], 1
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x1336
+; CHECK-NEXT:    br label [[FLOW1]]
+; CHECK:       END_ELSE_BLOCK:
+; CHECK-NEXT:    [[I_FINAL:%.*]] = phi i32 [ [[TMP3]], [[FLOW2]] ]
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x1337
+; CHECK-NEXT:    [[CMP_END_ELSE_BLOCK:%.*]] = icmp eq i32 [[I_FINAL]], -1
+; CHECK-NEXT:    br label [[FLOW3]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    [[TMP9]] = phi i32 [ [[I_FINAL]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[RETURN]], label [[LOOP_HEADER]]
+; CHECK:       RETURN:
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x99
+; CHECK-NEXT:    store volatile <2 x float> [[LOAD1]], <2 x float> addrspace(1)* undef, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+  %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
+  %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
+  br label %LOOP.HEADER
+
+LOOP.HEADER:
+  %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
+  call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
+  %tmp12 = zext i32 %i to i64
+  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
+  %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
+  %tmp15 = extractelement <4 x i32> %tmp14, i64 0
+  %tmp16 = and i32 %tmp15, 65535
+  %tmp17 = icmp eq i32 %tmp16, 1
+  br i1 %tmp17, label %bb18, label %bb62
+
+bb18:
+  %tmp19 = extractelement <2 x i32> %tmp, i64 0
+  %tmp22 = lshr i32 %tmp19, 16
+  %tmp24 = urem i32 %tmp22, 52
+  %tmp25 = mul nuw nsw i32 %tmp24, 52
+  br label %INNER_LOOP
+
+INNER_LOOP:
+  %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
+  call void asm sideeffect "; inner loop body", ""() #0
+  %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
+  %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
+  br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
+
+INNER_LOOP_BREAK:
+  %tmp59 = extractelement <4 x i32> %tmp14, i64 2
+  call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+  br label %END_ELSE_BLOCK
+
+bb62:
+  %load13 = icmp ult i32 %tmp16, 271
+  ;br i1 %load13, label %bb64, label %INCREMENT_I
+  ; branching directly to the return avoids the bug
+  br i1 %load13, label %RETURN, label %INCREMENT_I
+
+
+bb64:
+  call void asm sideeffect "s_nop 42", "~{memory}"() #0
+  br label %RETURN
+
+INCREMENT_I:
+  %inc.i = add i32 %i, 1
+  call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
+  br label %END_ELSE_BLOCK
+
+END_ELSE_BLOCK:
+  %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
+  call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
+  %cmp.end.else.block = icmp eq i32 %i.final, -1
+  br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
+
+RETURN:
+  call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
+  store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone }
diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg b/llvm/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg
new file mode 100644
index 0000000..2a665f0
--- /dev/null
+++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll b/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll
index 58634d0..7b5bd5a 100644
--- a/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll
+++ b/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll
@@ -1,32 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
 
 define void @main(float addrspace(1)* %out) {
-
-; CHECK: main_body:
-; CHECK: br label %LOOP.outer
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    br label [[LOOP_OUTER:%.*]]
+; CHECK:       LOOP.outer:
+; CHECK-NEXT:    [[TEMP8_0_PH:%.*]] = phi float [ 0.000000e+00, [[MAIN_BODY:%.*]] ], [ [[TMP13:%.*]], [[FLOW3:%.*]] ]
+; CHECK-NEXT:    [[TEMP4_0_PH:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP12:%.*]], [[FLOW3]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       LOOP:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP12]], [[FLOW:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ undef, [[LOOP_OUTER]] ], [ [[TMP13]], [[FLOW]] ]
+; CHECK-NEXT:    [[TEMP4_0:%.*]] = phi i32 [ [[TEMP4_0_PH]], [[LOOP_OUTER]] ], [ [[TMP15:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TEMP4_0]], 1
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i1 [[TMP22]], true
+; CHECK-NEXT:    br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ [[TEMP8_0_PH]], [[IF29:%.*]] ], [ [[TMP9:%.*]], [[FLOW1:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP20]], [[IF29]] ], [ undef, [[FLOW1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ [[TMP32:%.*]], [[IF29]] ], [ true, [[FLOW1]] ]
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    br i1 [[TMP16:%.*]], label [[ENDLOOP:%.*]], label [[LOOP_OUTER]]
+; CHECK:       ENDLOOP:
+; CHECK-NEXT:    [[TEMP8_1:%.*]] = phi float [ [[TMP14:%.*]], [[FLOW3]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP20]], 3
+; CHECK-NEXT:    [[DOT45:%.*]] = select i1 [[TMP23]], float 0.000000e+00, float 1.000000e+00
+; CHECK-NEXT:    store float [[DOT45]], float addrspace(1)* [[OUT:%.*]]
+; CHECK-NEXT:    ret void
+; CHECK:       ENDIF:
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP20]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i1 [[TMP31]], true
+; CHECK-NEXT:    br i1 [[TMP6]], label [[ENDIF28:%.*]], label [[FLOW1]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP20]], [[ENDIF28]] ], [ [[TMP0]], [[ENDIF]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi float [ [[TMP35:%.*]], [[ENDIF28]] ], [ [[TMP1]], [[ENDIF]] ]
+; CHECK-NEXT:    [[TMP9]] = phi float [ [[TMP35]], [[ENDIF28]] ], [ [[TEMP8_0_PH]], [[ENDIF]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ [[TMP36:%.*]], [[ENDIF28]] ], [ true, [[ENDIF]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ false, [[ENDIF28]] ], [ true, [[ENDIF]] ]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[IF29]], label [[FLOW2:%.*]]
+; CHECK:       IF29:
+; CHECK-NEXT:    [[TMP32]] = icmp sgt i32 [[TMP20]], 2
+; CHECK-NEXT:    br label [[FLOW2]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP12]] = phi i32 [ [[TMP7]], [[FLOW2]] ], [ [[TMP0]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP13]] = phi float [ [[TMP8]], [[FLOW2]] ], [ [[TMP1]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP14]] = phi float [ [[TMP3]], [[FLOW2]] ], [ [[TEMP8_0_PH]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP15]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP]] ]
+; CHECK-NEXT:    [[TMP16]] = phi i1 [ [[TMP10]], [[FLOW2]] ], [ true, [[LOOP]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ true, [[LOOP]] ]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[FLOW3]], label [[LOOP]]
+; CHECK:       ENDIF28:
+; CHECK-NEXT:    [[TMP35]] = fadd float [[TEMP8_0_PH]], 1.000000e+00
+; CHECK-NEXT:    [[TMP36]] = icmp sgt i32 [[TMP20]], 2
+; CHECK-NEXT:    br label [[FLOW1]]
+;
 main_body:
   br label %LOOP.outer
 
-; CHECK: LOOP.outer:
-; CHECK: br label %LOOP
 LOOP.outer:                                       ; preds = %ENDIF28, %main_body
   %temp8.0.ph = phi float [ 0.000000e+00, %main_body ], [ %tmp35, %ENDIF28 ]
   %temp4.0.ph = phi i32 [ 0, %main_body ], [ %tmp20, %ENDIF28 ]
   br label %LOOP
 
-; CHECK: LOOP:
-; br i1 %{{[0-9]+}}, label %ENDIF, label %Flow
 LOOP:                                             ; preds = %IF29, %LOOP.outer
   %temp4.0 = phi i32 [ %temp4.0.ph, %LOOP.outer ], [ %tmp20, %IF29 ]
   %tmp20 = add i32 %temp4.0, 1
   %tmp22 = icmp sgt i32 %tmp20, 3
   br i1 %tmp22, label %ENDLOOP, label %ENDIF
 
-; CHECK: Flow3
-; CHECK: br i1 %{{[0-9]+}}, label %ENDLOOP, label %LOOP.outer
-
-; CHECK: ENDLOOP:
-; CHECK: ret void
 ENDLOOP:                                          ; preds = %ENDIF28, %IF29, %LOOP
   %temp8.1 = phi float [ %temp8.0.ph, %LOOP ], [ %temp8.0.ph, %IF29 ], [ %tmp35, %ENDIF28 ]
   %tmp23 = icmp eq i32 %tmp20, 3
@@ -34,29 +78,14 @@
   store float %.45, float addrspace(1)* %out
   ret void
 
-; CHECK: ENDIF:
-; CHECK: br i1 %tmp31, label %IF29, label %Flow1
 ENDIF:                                            ; preds = %LOOP
   %tmp31 = icmp sgt i32 %tmp20, 1
   br i1 %tmp31, label %IF29, label %ENDIF28
 
-; CHECK: Flow:
-; CHECK: br i1 %{{[0-9]+}}, label %Flow2, label %LOOP
-
-; CHECK: IF29:
-; CHECK: br label %Flow1
 IF29:                                             ; preds = %ENDIF
   %tmp32 = icmp sgt i32 %tmp20, 2
   br i1 %tmp32, label %ENDLOOP, label %LOOP
 
-; CHECK: Flow1:
-; CHECK: br label %Flow
-
-; CHECK: Flow2:
-; CHECK: br i1 %{{[0-9]+}}, label %ENDIF28, label %Flow3
-
-; CHECK: ENDIF28:
-; CHECK: br label %Flow3
 ENDIF28:                                          ; preds = %ENDIF
   %tmp35 = fadd float %temp8.0.ph, 1.0
   %tmp36 = icmp sgt i32 %tmp20, 2