[LoopInterchange] Preserve LCSSA.

This patch extends LoopInterchange to move the LCSSA PHI nodes to the
right place after interchanging. This is required for LoopInterchange to
become a function pass.

As an alternative to manually moving the PHIs, we could also re-form the
LCSSA phis for a set of interchanged loops, but that's more
expensive.

Reviewers: efriedma, mcrosier, davide

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D52154

llvm-svn: 343132
diff --git a/llvm/test/Transforms/LoopInterchange/interchangeable.ll b/llvm/test/Transforms/LoopInterchange/interchangeable.ll
index 44985d8..a97981c 100644
--- a/llvm/test/Transforms/LoopInterchange/interchangeable.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchangeable.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopInterchange/lcssa.ll b/llvm/test/Transforms/LoopInterchange/lcssa.ll
index b44c240..8886cf4 100644
--- a/llvm/test/Transforms/LoopInterchange/lcssa.ll
+++ b/llvm/test/Transforms/LoopInterchange/lcssa.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -pass-remarks-output=%t
 ; RUN: cat %t |  FileCheck --check-prefix REMARK %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -17,20 +17,20 @@
 ; REMARK: UnsupportedExitPHI
 ; REMARK-NEXT: lcssa_01
 
-define void @lcssa_01(){
+define void @lcssa_01() {
 entry:
   %cmp21 = icmp sgt i64 100, 1
   br i1 %cmp21, label %outer.ph, label %for.end16
 
-outer.ph:
+outer.ph:                                         ; preds = %entry
   %cmp218 = icmp sgt i64 100, 1
   br label %outer.header
 
-outer.header:
-  %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ]
+outer.header:                                     ; preds = %outer.inc, %outer.ph
+  %iv.outer = phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ]
   br i1 %cmp218, label %for.body3, label %outer.inc
 
-for.body3:
+for.body3:                                        ; preds = %for.body3, %outer.header
   %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
   %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
   %vA = load i32, i32* %arrayidx5
@@ -42,35 +42,36 @@
   %exitcond = icmp eq i64 %iv.inner.next, 100
   br i1 %exitcond, label %outer.inc, label %for.body3
 
-outer.inc:
+outer.inc:                                        ; preds = %for.body3, %outer.header
   %iv.outer.next = add nsw i64 %iv.outer, 1
   %cmp = icmp eq i64 %iv.outer.next, 100
   br i1 %cmp, label %outer.header, label %for.exit
 
-for.exit:
-  store i64 %iv.outer.next, i64 * @Y
+for.exit:                                         ; preds = %outer.inc
+  %iv.outer.next.lcssa = phi i64 [ %iv.outer.next, %outer.inc ]
+  store i64 %iv.outer.next.lcssa, i64* @Y
   br label %for.end16
 
-for.end16:
+for.end16:                                        ; preds = %for.exit, %entry
   ret void
 }
 
 ; REMARK: UnsupportedExitPHI
 ; REMARK-NEXT: lcssa_02
-define void @lcssa_02(){
+define void @lcssa_02() {
 entry:
   %cmp21 = icmp sgt i64 100, 1
   br i1 %cmp21, label %outer.ph, label %for.end16
 
-outer.ph:
+outer.ph:                                         ; preds = %entry
   %cmp218 = icmp sgt i64 100, 1
   br label %outer.header
 
-outer.header:
-  %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ]
+outer.header:                                     ; preds = %outer.inc, %outer.ph
+  %iv.outer = phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ]
   br i1 %cmp218, label %for.body3, label %outer.inc
 
-for.body3:
+for.body3:                                        ; preds = %for.body3, %outer.header
   %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
   %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
   %vA = load i32, i32* %arrayidx5
@@ -82,32 +83,32 @@
   %exitcond = icmp eq i64 %iv.inner.next, 100
   br i1 %exitcond, label %outer.inc, label %for.body3
 
-outer.inc:
+outer.inc:                                        ; preds = %for.body3, %outer.header
   %iv.inner.end = phi i64 [ 0, %outer.header ], [ %iv.inner.next, %for.body3 ]
   %iv.outer.next = add nsw i64 %iv.outer, 1
   %cmp = icmp eq i64 %iv.outer.next, 100
   br i1 %cmp, label %outer.header, label %for.exit
 
-for.exit:
-  store i64 %iv.inner.end, i64 * @Y
+for.exit:                                         ; preds = %outer.inc
+  %iv.inner.end.lcssa = phi i64 [ %iv.inner.end, %outer.inc ]
+  store i64 %iv.inner.end.lcssa, i64* @Y
   br label %for.end16
 
-for.end16:
+for.end16:                                        ; preds = %for.exit, %entry
   ret void
 }
 
-
 ; REMARK: Interchanged
 ; REMARK-NEXT: lcssa_03
-define void @lcssa_03(){
+define void @lcssa_03() {
 entry:
   br label %outer.header
 
-outer.header:
-  %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
   br label %for.body3
 
-for.body3:
+for.body3:                                        ; preds = %for.body3, %outer.header
   %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
   %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
   %vA = load i32, i32* %arrayidx5
@@ -119,16 +120,18 @@
   %exitcond = icmp eq i64 %iv.inner.next, 100
   br i1 %exitcond, label %outer.inc, label %for.body3
 
-outer.inc:
+outer.inc:                                        ; preds = %for.body3
+  %iv.inner.lcssa = phi i64 [ %iv.inner, %for.body3 ]
   %iv.outer.next = add nsw i64 %iv.outer, 1
   %cmp = icmp eq i64 %iv.outer.next, 100
   br i1 %cmp, label %outer.header, label %for.exit
 
-for.exit:
-  store i64 %iv.inner, i64 * @Y
+for.exit:                                         ; preds = %outer.inc
+  %iv.inner.lcssa.lcssa = phi i64 [ %iv.inner.lcssa, %outer.inc ]
+  store i64 %iv.inner.lcssa.lcssa, i64* @Y
   br label %for.end16
 
-for.end16:
+for.end16:                                        ; preds = %for.exit
   ret void
 }
 
@@ -136,16 +139,17 @@
 ;        types, as we fail to detect floating point reductions for now.
 ; REMARK: UnsupportedPHIOuter
 ; REMARK-NEXT: lcssa_04
-define void @lcssa_04(){
+
+define void @lcssa_04() {
 entry:
   br label %outer.header
 
-outer.header:
-  %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
-  %float.outer= phi float [ 1.0, %entry ], [ 2.0, %outer.inc ]
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %outer.inc ]
   br label %for.body3
 
-for.body3:
+for.body3:                                        ; preds = %for.body3, %outer.header
   %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
   %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
   %vA = load i32, i32* %arrayidx5
@@ -157,15 +161,141 @@
   %exitcond = icmp eq i64 %iv.inner.next, 100
   br i1 %exitcond, label %outer.inc, label %for.body3
 
-outer.inc:
+outer.inc:                                        ; preds = %for.body3
   %iv.outer.next = add nsw i64 %iv.outer, 1
   %cmp = icmp eq i64 %iv.outer.next, 100
   br i1 %cmp, label %outer.header, label %for.exit
 
-for.exit:
-  store float %float.outer, float* @F
+for.exit:                                         ; preds = %outer.inc
+  %float.outer.lcssa = phi float [ %float.outer, %outer.inc ]
+  store float %float.outer.lcssa, float* @F
   br label %for.end16
 
-for.end16:
+for.end16:                                        ; preds = %for.exit
+  ret void
+}
+
+; PHI node in inner latch with multiple predecessors.
+; REMARK: Interchanged
+; REMARK-NEXT: lcssa_05
+
+define void @lcssa_05(i32* %ptr) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %bb3, %outer.header
+  %iv.inner = phi i64 [ %iv.inner.next, %bb3 ], [ 1, %outer.header ]
+  br i1 undef, label %bb2, label %bb3
+
+bb2:                                              ; preds = %for.body3
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vC = load i32, i32* %arrayidx9
+  %add = add nsw i32 %vA, %vC
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %for.body3
+  %addp = phi i32 [ %add, %bb2 ], [ 0, %for.body3 ]
+  store i32 %addp, i32* %ptr
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:                                        ; preds = %bb3
+  %iv.inner.lcssa = phi i64 [ %iv.inner, %bb3 ]
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:                                         ; preds = %outer.inc
+  %iv.inner.lcssa.lcssa = phi i64 [ %iv.inner.lcssa, %outer.inc ]
+  store i64 %iv.inner.lcssa.lcssa, i64* @Y
+  br label %for.end16
+
+for.end16:                                        ; preds = %for.exit
+  ret void
+}
+
+; REMARK: UnsupportedExitPHI
+; REMARK-NEXT: lcssa_06
+
+define void @lcssa_06(i64* %ptr, i32* %ptr1) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  br i1 undef, label %for.body3, label %outer.inc
+
+for.body3:                                        ; preds = %for.body3, %outer.header
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vC = load i32, i32* %arrayidx9
+  %add = add nsw i32 %vA, %vC
+  store i32 %add, i32* %ptr1
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:                                        ; preds = %for.body3, %outer.header
+  %sv = phi i64 [ 0, %outer.header ], [ 1, %for.body3 ]
+  store i64 %sv, i64* %ptr
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:                                         ; preds = %outer.inc
+  %sv.lcssa = phi i64 [ %sv, %outer.inc ]
+  store i64 %sv.lcssa, i64* @Y
+  br label %for.end16
+
+for.end16:                                        ; preds = %for.exit
+  ret void
+}
+
+; REMARK: Interchanged
+; REMARK-NEXT: lcssa_07
+define void @lcssa_07() {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %outer.header
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vC = load i32, i32* %arrayidx9
+  %add = add nsw i32 %vA, %vC
+  store i32 %add, i32* %arrayidx5
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.bb, label %for.body3
+
+outer.bb:                                         ; preds = %for.body3
+  %iv.inner.lcssa = phi i64 [ %iv.inner, %for.body3 ]
+  br label %outer.inc
+
+outer.inc:                                        ; preds = %outer.bb
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:                                         ; preds = %outer.inc
+  %iv.inner.lcssa.lcssa = phi i64 [ %iv.inner.lcssa, %outer.inc ]
+  store i64 %iv.inner.lcssa.lcssa, i64* @Y
+  br label %for.end16
+
+for.end16:                                        ; preds = %for.exit
   ret void
 }
diff --git a/llvm/test/Transforms/LoopInterchange/phi-ordering.ll b/llvm/test/Transforms/LoopInterchange/phi-ordering.ll
index 05c2112..c741697 100644
--- a/llvm/test/Transforms/LoopInterchange/phi-ordering.ll
+++ b/llvm/test/Transforms/LoopInterchange/phi-ordering.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S 2>&1 | FileCheck %s
 ;; Checks the order of the inner phi nodes does not cause havoc.
 ;; The inner loop has a reduction into c. The IV is not the first phi.
 
diff --git a/llvm/test/Transforms/LoopInterchange/reductions.ll b/llvm/test/Transforms/LoopInterchange/reductions.ll
index da92276..28a2d8d 100644
--- a/llvm/test/Transforms/LoopInterchange/reductions.ll
+++ b/llvm/test/Transforms/LoopInterchange/reductions.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -S -debug 2>&1 | FileCheck %s
+; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -verify-loop-lcssa -S -debug 2>&1 | FileCheck %s
 
 @A = common global [500 x [500 x i32]] zeroinitializer
 @X = common global i32 0
@@ -18,7 +18,7 @@
   %cmp16 = icmp sgt i32 %N, 1
   br i1 %cmp16, label %for.body3.lr.ph, label %for.end8
 
-for.body3.lr.ph:                                  ; preds = %entry, %for.cond1.for.inc6_crit_edge
+for.body3.lr.ph:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
   %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
   %X.promoted = load i32, i32* @X
   br label %for.body3
@@ -35,7 +35,8 @@
   br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body3
 
 for.cond1.for.inc6_crit_edge:                     ; preds = %for.body3
-  store i32 %add, i32* @X
+  %add.lcssa = phi i32 [ %add, %for.body3 ]
+  store i32 %add.lcssa, i32* @X
   %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
   %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
   %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
@@ -56,12 +57,12 @@
 ;; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi.
 ; CHECK: Loops interchanged.
 
-define void @reduction_02(i32 %N)  {
+define void @reduction_02(i32 %N) {
 entry:
   %cmp34 = icmp sgt i32 %N, 1
   br i1 %cmp34, label %for.cond4.preheader.preheader, label %for.end19
 
-for.cond4.preheader.preheader:                    ; preds = %entry, %for.inc17
+for.cond4.preheader.preheader:                    ; preds = %for.inc17, %entry
   %indvars.iv40 = phi i64 [ %indvars.iv.next41, %for.inc17 ], [ 1, %entry ]
   br label %for.body6.lr.ph
 
@@ -87,20 +88,25 @@
   br i1 %exitcond, label %for.cond4.for.inc14_crit_edge, label %for.body6
 
 for.cond4.for.inc14_crit_edge:                    ; preds = %for.body6
-  store i32 %add, i32* @X
-  store i32 %add13, i32* @Y
+  %add.lcssa = phi i32 [ %add, %for.body6 ]
+  %add13.lcssa = phi i32 [ %add13, %for.body6 ]
+  store i32 %add.lcssa, i32* @X
+  store i32 %add13.lcssa, i32* @Y
   %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, 1
   %lftr.wideiv38 = trunc i64 %indvars.iv.next37 to i32
   %exitcond39 = icmp eq i32 %lftr.wideiv38, %N
   br i1 %exitcond39, label %for.inc17, label %for.body6.lr.ph
 
 for.inc17:                                        ; preds = %for.cond4.for.inc14_crit_edge
+  %add.lcssa.lcssa = phi i32 [ %add.lcssa, %for.cond4.for.inc14_crit_edge ]
   %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
   %lftr.wideiv42 = trunc i64 %indvars.iv.next41 to i32
   %exitcond43 = icmp eq i32 %lftr.wideiv42, %N
   br i1 %exitcond43, label %for.end19, label %for.cond4.preheader.preheader
 
 for.end19:                                        ; preds = %for.inc17, %entry
+  %res1 = phi i32 [ 0, %entry ], [ %add.lcssa.lcssa, %for.inc17 ]
+  store i32 %res1, i32* @X
   ret void
 }
 
@@ -117,17 +123,17 @@
 ;; Not interchanged hence the phi's in the inner loop will not be split.
 ; CHECK: Outer loops with reductions are not supported currently.
 
-define void @reduction_03(i32 %N)  {
+define void @reduction_03(i32 %N) {
 entry:
   %cmp35 = icmp sgt i32 %N, 1
   br i1 %cmp35, label %for.cond4.preheader.lr.ph, label %for.end19
 
-for.cond4.preheader.lr.ph:                        ; preds = %entry, %for.cond1.for.inc17_crit_edge
+for.cond4.preheader.lr.ph:                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
   %indvars.iv41 = phi i64 [ %indvars.iv.next42, %for.cond1.for.inc17_crit_edge ], [ 1, %entry ]
   %Y.promoted = load i32, i32* @Y
   br label %for.body6.lr.ph
 
-for.body6.lr.ph:                                  ; preds = %for.cond4.preheader.lr.ph, %for.cond4.for.end_crit_edge
+for.body6.lr.ph:                                  ; preds = %for.cond4.for.end_crit_edge, %for.cond4.preheader.lr.ph
   %indvars.iv37 = phi i64 [ 1, %for.cond4.preheader.lr.ph ], [ %indvars.iv.next38, %for.cond4.for.end_crit_edge ]
   %add1334 = phi i32 [ %Y.promoted, %for.cond4.preheader.lr.ph ], [ %add13, %for.cond4.for.end_crit_edge ]
   %X.promoted = load i32, i32* @X
@@ -145,7 +151,8 @@
   br i1 %exitcond, label %for.cond4.for.end_crit_edge, label %for.body6
 
 for.cond4.for.end_crit_edge:                      ; preds = %for.body6
-  store i32 %add, i32* @X
+  %add.lcssa = phi i32 [ %add, %for.body6 ]
+  store i32 %add.lcssa, i32* @X
   %arrayidx12 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @B, i64 0, i64 %indvars.iv37, i64 %indvars.iv41
   %1 = load i32, i32* %arrayidx12
   %add13 = add nsw i32 %add1334, %1
@@ -155,7 +162,8 @@
   br i1 %exitcond40, label %for.cond1.for.inc17_crit_edge, label %for.body6.lr.ph
 
 for.cond1.for.inc17_crit_edge:                    ; preds = %for.cond4.for.end_crit_edge
-  store i32 %add13, i32* @Y
+  %add13.lcssa = phi i32 [ %add13, %for.cond4.for.end_crit_edge ]
+  store i32 %add13.lcssa, i32* @Y
   %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
   %lftr.wideiv43 = trunc i64 %indvars.iv.next42 to i32
   %exitcond44 = icmp eq i32 %lftr.wideiv43, %N
@@ -181,7 +189,7 @@
   %cmp28 = icmp sgt i32 %N, 1
   br i1 %cmp28, label %for.cond4.preheader.preheader, label %for.end15
 
-for.cond4.preheader.preheader:                    ; preds = %entry, %for.inc13
+for.cond4.preheader.preheader:                    ; preds = %for.inc13, %entry
   %i.029 = phi i32 [ %inc14, %for.inc13 ], [ 1, %entry ]
   br label %for.body6.lr.ph
 
@@ -205,8 +213,10 @@
   br i1 %exitcond, label %for.cond4.for.inc10_crit_edge, label %for.body6
 
 for.cond4.for.inc10_crit_edge:                    ; preds = %for.body6
-  store i32 %add, i32* @X
-  store i32 %add9, i32* @Y
+  %add.lcssa = phi i32 [ %add, %for.body6 ]
+  %add9.lcssa = phi i32 [ %add9, %for.body6 ]
+  store i32 %add.lcssa, i32* @X
+  store i32 %add9.lcssa, i32* @Y
   %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 1
   %lftr.wideiv32 = trunc i64 %indvars.iv.next31 to i32
   %exitcond33 = icmp eq i32 %lftr.wideiv32, %N
@@ -231,7 +241,7 @@
   %cmp16 = icmp sgt i32 %N, 1
   br i1 %cmp16, label %for.body7.lr.ph, label %for.end8
 
-for.body7.lr.ph:                                  ; preds = %entry, %for.cond1.for.inc6_crit_edge
+for.body7.lr.ph:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
   %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
   %X.promoted = load i32, i32* @X
   br label %for.body7
@@ -248,15 +258,15 @@
   br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body7
 
 for.cond1.for.inc6_crit_edge:                     ; preds = %for.body7
-  store i32 %add, i32* @X
+  %add.lcssa = phi i32 [ %add, %for.body7 ]
+  store i32 %add.lcssa, i32* @X
   %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
   %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
   %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
   br i1 %exitcond21, label %for.end8, label %for.body7.lr.ph
 
 for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
-  %add.res = phi i32 [ %add, %for.cond1.for.inc6_crit_edge], [ 0, %entry ]
+  %add.res = phi i32 [ %add.lcssa, %for.cond1.for.inc6_crit_edge ], [ 0, %entry ]
   store i32 %add.res, i32* @Y
-
   ret void
 }