[UnrollAndJam] New Unroll and Jam pass
This is a simple implementation of the unroll-and-jam classical loop
optimisation.
The basic idea is that we take an outer loop of the form:
for i..
  ForeBlocks(i)
  for j..
    SubLoopBlocks(i, j)
  AftBlocks(i)
Instead of doing normal inner or outer unrolling, we unroll as follows:
for i... i+=2
  ForeBlocks(i)
  ForeBlocks(i+1)
  for j..
    SubLoopBlocks(i, j)
    SubLoopBlocks(i+1, j)
  AftBlocks(i)
  AftBlocks(i+1)
Remainder Loop
So we have unrolled the outer loop, then jammed the two inner loops into
one. This can lead to a simpler inner loop if memory accesses can be shared
between the now jammed loops.
To do this we have to prove that this is all safe, both for the memory
accesses (using dependence analysis) and that ForeBlocks(i+1) can move before
AftBlocks(i) and SubLoopBlocks(i, j).
Differential Revision: https://reviews.llvm.org/D41953
llvm-svn: 336062
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
new file mode 100644
index 0000000..8906830
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
@@ -0,0 +1,470 @@
+; RUN: opt -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+; Fore stores A[i], Aft stores A[i-1]: hoisting Fore(i+1) above Aft(i) keeps the per-element store order, so unroll-and-jam by 4 is expected.
+; CHECK-LABEL: fore_aft_less
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @fore_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; Fore: A[i] = 1
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add6 = add nuw nsw i32 %j, 1
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %add72 = add nuw nsw i32 %i, -1  ; NOTE(review): nuw with addend -1 wraps for any %i != 0; confirm the flag is intended
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; Aft: A[i-1] = sum
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; Fore and Aft both store A[i] within the same outer iteration; no other iteration touches that element, so unroll-and-jam by 4 is expected.
+; CHECK-LABEL: fore_aft_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @fore_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; Fore: A[i] = 1
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add6 = add nuw nsw i32 %j, 1
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %add72 = add nuw nsw i32 %i, 0  ; unused: the GEP below indexes with %i directly
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add, i32* %arrayidx8, align 4  ; Aft: A[i] = sum
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; Fore stores A[i], Aft stores A[i+1]: hoisting Fore(i+1) above Aft(i) would swap the two stores to A[i+1], so no unroll-and-jam is expected.
+; CHECK-LABEL: fore_aft_more
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
+define void @fore_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; Fore: A[i] = 1
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add6 = add nuw nsw i32 %j, 1
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %add72 = add nuw nsw i32 %i, 1
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; Aft: A[i+1] = sum
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; Fore stores A[i], the inner loop stores A[i-1]: Fore(i) still precedes the inner-loop stores of iteration i+1, so unroll-and-jam by 4 is expected.
+; CHECK-LABEL: fore_sub_less
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @fore_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; Fore: A[i] = 1
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add72 = add nuw nsw i32 %i, -1  ; NOTE(review): nuw with addend -1 wraps for any %i != 0; confirm the flag is intended
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; inner loop: A[i-1] = running sum
+ %add6 = add nuw nsw i32 %j, 1
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; Fore and the inner loop both store A[i] within one outer iteration; no other iteration touches that element, so unroll-and-jam by 4 is expected.
+; CHECK-LABEL: fore_sub_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @fore_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; Fore: A[i] = 1
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add72 = add nuw nsw i32 %i, 0
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; inner loop: A[i] = running sum
+ %add6 = add nuw nsw i32 %j, 1
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; Fore stores A[i], the inner loop stores A[i+1]: hoisting Fore(i+1) above the inner loop of iteration i would reorder the stores to A[i+1], so no unroll-and-jam is expected.
+; CHECK-LABEL: fore_sub_more
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
+define void @fore_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; Fore: A[i] = 1
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add72 = add nuw nsw i32 %i, 1
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; inner loop: A[i+1] = running sum
+ %add6 = add nuw nsw i32 %j, 1
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; The inner loop stores A[i], Aft stores A[i-1]: Aft(i+1) still runs after the inner-loop stores of iteration i, so unroll-and-jam by 4 is expected.
+; CHECK-LABEL: sub_aft_less
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @sub_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add6 = add nuw nsw i32 %j, 1
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; inner loop: A[i] = 1
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %add72 = add nuw nsw i32 %i, -1  ; NOTE(review): nuw with addend -1 wraps for any %i != 0; confirm the flag is intended
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; Aft: A[i-1] = sum
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; The inner loop and Aft both store A[i] within one outer iteration; no other iteration touches that element, so unroll-and-jam by 4 is expected.
+; CHECK-LABEL: sub_aft_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @sub_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add6 = add nuw nsw i32 %j, 1
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; inner loop: A[i] = 1
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %add72 = add nuw nsw i32 %i, 0  ; unused: the GEP below indexes with %i directly
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add, i32* %arrayidx8, align 4  ; Aft: A[i] = sum
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; The inner loop stores A[i], Aft stores A[i+1]: jamming would run the inner-loop stores of iteration i+1 before Aft(i), reordering the stores to A[i+1], so no unroll-and-jam is expected.
+; CHECK-LABEL: sub_aft_more
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
+define void @sub_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add6 = add nuw nsw i32 %j, 1
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; inner loop: A[i] = 1
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %add72 = add nuw nsw i32 %i, 1
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; Aft: A[i+1] = sum
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; The inner loop stores both A[i] and A[i-1]: interleaving the inner iterations of i and i+1 would reorder their stores to A[i], so no unroll-and-jam is expected.
+; CHECK-LABEL: sub_sub_less
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
+define void @sub_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add6 = add nuw nsw i32 %j, 1
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; inner loop: A[i] = 1
+ %add72 = add nuw nsw i32 %i, -1  ; NOTE(review): nuw with addend -1 wraps for any %i != 0; confirm the flag is intended
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; inner loop: A[i-1] = running sum
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; Both inner-loop stores hit A[i] only, so different outer iterations never touch the same element and unroll-and-jam is expected (only the first extra copy, %j.1, is verified).
+; CHECK-LABEL: sub_sub_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add6 = add nuw nsw i32 %j, 1
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; inner loop: A[i] = 1
+ %add72 = add nuw nsw i32 %i, 0
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; inner loop: A[i] = running sum
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
+
+; The inner loop stores both A[i] and A[i+1]: interleaving the inner iterations of i and i+1 would reorder their stores to A[i+1], so no unroll-and-jam is expected.
+; CHECK-LABEL: sub_sub_more
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
+define void @sub_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+ %cmp = icmp sgt i32 %N, 0
+ br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+ %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+ %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+ %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx5, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add6 = add nuw nsw i32 %j, 1
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 1, i32* %arrayidx, align 4  ; inner loop: A[i] = 1
+ %add72 = add nuw nsw i32 %i, 1
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+ store i32 %add, i32* %arrayidx8, align 4  ; inner loop: A[i+1] = running sum
+ %exitcond = icmp eq i32 %add6, %N
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add7 = add nuw nsw i32 %i, 1
+ %exitcond29 = icmp eq i32 %add7, %N
+ br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/disable.ll b/llvm/test/Transforms/LoopUnrollAndJam/disable.ll
new file mode 100644
index 0000000..4a00937
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnrollAndJam/disable.ll
@@ -0,0 +1,741 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -pass-remarks=loop-unroll-and-jam < %s -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+;; Common check for all tests. None should be unroll and jammed
+; CHECK-NOT: remark: {{.*}} unroll and jammed
+
+
+; CHECK-LABEL: disabled1
+; Tests for(i) { sum = A[i]; for(j) sum += B[j]; A[i+1] = sum; }
+; A[i] to A[i+1] dependency should block unrollandjam
+define void @disabled1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
+; CHECK: %j.026 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp127 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp127, %cmp
+ br i1 %or.cond, label %for.preheader, label %return
+
+for.preheader:
+ br label %for.outer
+
+for.outer:
+ %i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
+ %b.028 = phi i32 [ %inc8, %for.latch ], [ 1, %for.preheader ]  ; b == i + 1: starts at 1 and steps together with %i.029
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.029
+ %0 = load i32, i32* %arrayidx, align 4  ; Fore: the sum starts from A[i]
+ br label %for.inner
+
+for.inner:
+ %j.026 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1.025 = phi i32 [ %0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.026
+ %1 = load i32, i32* %arrayidx6, align 4
+ %add = add i32 %1, %sum1.025
+ %inc = add nuw i32 %j.026, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %b.028
+ store i32 %add, i32* %arrayidx7, align 4  ; Aft: A[i+1] = sum, read back by the next iteration's Fore load
+ %inc8 = add nuw nsw i32 %b.028, 1
+ %add10 = add nuw nsw i32 %i.029, 1
+ %exitcond30 = icmp eq i32 %add10, %I
+ br i1 %exitcond30, label %return, label %for.outer
+
+return:
+ ret void
+}
+
+
+; CHECK-LABEL: disabled2
+; Tests an incompatible block layout (for.outer jumps past for.inner)
+; FIXME: Make this work
+define void @disabled2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
+; CHECK: %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp131 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp131, %cmp
+ br i1 %or.cond, label %for.preheader, label %for.end14
+
+for.preheader:
+ br label %for.outer
+
+for.outer:
+ %i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.032
+ %0 = load i32, i32* %arrayidx, align 4
+ %tobool = icmp eq i32 %0, 0
+ br i1 %tobool, label %for.latch, label %for.inner  ; B[i] == 0 jumps straight to the latch, bypassing the inner loop
+
+for.inner:
+ %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.outer ]
+ %sum1.029 = phi i32 [ %sum1.1, %for.inner ], [ 0, %for.outer ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.030
+ %1 = load i32, i32* %arrayidx6, align 4
+ %tobool7 = icmp eq i32 %1, 0
+ %sub = add i32 %sum1.029, 10
+ %add = sub i32 %sub, %1
+ %sum1.1 = select i1 %tobool7, i32 %sum1.029, i32 %add  ; sum is unchanged when B[j] == 0, else sum + 10 - B[j]
+ %inc = add nuw i32 %j.030, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %sum1.1.lcssa = phi i32 [ 0, %for.outer ], [ %sum1.1, %for.inner ]  ; 0 on the path that bypassed the inner loop
+ %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %i.032
+ store i32 %sum1.1.lcssa, i32* %arrayidx11, align 4
+ %add13 = add nuw i32 %i.032, 1
+ %exitcond33 = icmp eq i32 %add13, %I
+ br i1 %exitcond33, label %for.end14, label %for.outer
+
+for.end14:
+ ret void
+}
+
+
+; CHECK-LABEL: disabled3
+; Tests loop-carried dependencies through an array S
+define void @disabled3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
+; CHECK: %j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %S = alloca [4 x i32], align 4  ; one scratch array, allocated once and reused by every outer iteration
+ %cmp = icmp eq i32 %J, 0
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %0 = bitcast [4 x i32]* %S to i8*
+ %cmp128 = icmp eq i32 %I, 0
+ br i1 %cmp128, label %for.cond.cleanup, label %for.preheader
+
+for.preheader:
+ %arrayidx9 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 0  ; address of S[0], read in the latch
+ br label %for.outer
+
+for.cond.cleanup:
+ br label %return
+
+for.outer:
+ %i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
+ br label %for.inner
+
+for.inner:
+ %j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.027
+ %l2 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %j.027, %i.029
+ %rem = urem i32 %add, %J
+ %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %rem
+ %l3 = load i32, i32* %arrayidx6, align 4
+ %mul = mul i32 %l3, %l2
+ %rem7 = urem i32 %j.027, 3
+ %arrayidx8 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 %rem7
+ store i32 %mul, i32* %arrayidx8, align 4  ; S[j % 3] = B[j] * B[(i + j) % J]
+ %inc = add nuw i32 %j.027, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %l1 = load i32, i32* %arrayidx9, align 4  ; Aft: reads S[0], which the inner loop writes
+ %arrayidx10 = getelementptr inbounds i32, i32* %A, i32 %i.029
+ store i32 %l1, i32* %arrayidx10, align 4
+ %add12 = add nuw i32 %i.029, 1
+ %exitcond31 = icmp eq i32 %add12, %I
+ br i1 %exitcond31, label %for.cond.cleanup, label %for.outer
+
+return:
+ ret void
+}
+
+
+; CHECK-LABEL: disabled4
+; Inner loop induction variable is not consistent
+; ie for(i = 0..n) for (j = 0..i) sum+=B[j]
+define void @disabled4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
+; CHECK: %j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ugt i32 %I, 1
+ %or.cond = and i1 %cmp122, %cmp
+ br i1 %or.cond, label %for.preheader, label %for.end9
+
+for.preheader:
+ br label %for.outer
+
+for.outer:
+ %indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1.020 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.021
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1.020
+ %inc = add nuw i32 %j.021, 1
+ %exitcond = icmp eq i32 %inc, %indvars.iv  ; inner bound is the outer IV, so the inner trip count differs per outer iteration
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+ store i32 %add, i32* %arrayidx6, align 4  ; Aft: A[iv] = sum of B[0..iv-1]
+ %indvars.iv.next = add nuw i32 %indvars.iv, 1
+ %exitcond24 = icmp eq i32 %indvars.iv.next, %I
+ br i1 %exitcond24, label %for.end9, label %for.outer
+
+for.end9:
+ ret void
+}
+
+
+; CHECK-LABEL: disabled5
+; Test odd uses of phi nodes where the outer IV cannot be moved into Fore as it hits a PHI
+@f = hidden global i32 0, align 4
+define i32 @disabled5() #0 {
+; CHECK: %0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
+; CHECK: %1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]
+entry:
+ %f.promoted10 = load i32, i32* @f, align 4
+ br label %for.outer
+
+for.outer:
+ %0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]  ; outer phi: the loaded @f value first, then the constant 2
+ %d.018 = phi i16 [ 0, %entry ], [ %odd.lcssa, %for.latch ]
+ %inc5.sink9 = phi i32 [ 2, %entry ], [ %inc5, %for.latch ]  ; outer counter: starts at 2, loop exits when it reaches 7
+ br label %for.inner
+
+for.inner:
+ %1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]  ; inner phi seeded from the outer phi %0
+ %inc.sink8 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %inc = add nuw nsw i32 %inc.sink8, 1
+ %exitcond = icmp ne i32 %inc, 7
+ br i1 %exitcond, label %for.inner, label %for.latch
+
+for.latch:
+ %.lcssa = phi i32 [ %1, %for.inner ]  ; value of the inner phi on leaving the inner loop
+ %odd.lcssa = phi i16 [ 1, %for.inner ]
+ %inc5 = add nuw nsw i32 %inc5.sink9, 1
+ %exitcond11 = icmp ne i32 %inc5, 7
+ br i1 %exitcond11, label %for.outer, label %for.end
+
+for.end:
+ %.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
+ %inc.lcssa.lcssa = phi i32 [ 7, %for.latch ]
+ ret i32 0
+}
+
+
+; CHECK-LABEL: disabled6
+; There is a dependency in here, between @d6 and %0 (loaded from @f6, which points to @d6)
+@d6 = hidden global i16 5, align 2
+@f6 = hidden global i16* @d6, align 4
+define i32 @disabled6() #0 {
+; CHECK: %inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
+; CHECK: %c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
+entry:
+ store i16 1, i16* @d6, align 2
+ %0 = load i16*, i16** @f6, align 4  ; %0 = pointer held in @f6 (its initialiser is @d6)
+ br label %for.body.i
+
+for.body.i:
+ %inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
+ %1 = load i16, i16* %0, align 2  ; outer header: load through %0
+ br label %for.body6.i
+
+for.cond.cleanup.i:
+ %inc8.i = add nuw nsw i16 %inc8.sink14.i, 1
+ store i16 %inc8.i, i16* @d6, align 2  ; outer latch: store to @d6, the location %0 may point at
+ %cmp.i = icmp ult i16 %inc8.i, 6
+ br i1 %cmp.i, label %for.body.i, label %test.exit
+
+for.body6.i:
+ %c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
+ %inc.i = add nuw nsw i32 %c.013.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 7  ; inner loop is a pure 7-iteration counter
+ br i1 %exitcond.i, label %for.cond.cleanup.i, label %for.body6.i
+
+test.exit:
+ %conv2.i = sext i16 %1 to i32
+ ret i32 0
+}
+
+
+; CHECK-LABEL: disabled7
+; Has negative output dependency
+define void @disabled7(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
+; CHECK: %j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp127 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp127, %cmp
+ br i1 %or.cond, label %for.body.preheader, label %for.end12
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.028
+ store i32 0, i32* %arrayidx, align 4  ; Fore: A[i] = 0
+ %sub = add i32 %i.028, -1
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %sub
+ store i32 2, i32* %arrayidx2, align 4  ; Fore: A[i-1] = 2, overwriting the previous iteration's latch store
+ br label %for.body6
+
+for.cond3.for.cond.cleanup5_crit_edge:
+ store i32 %add, i32* %arrayidx, align 4  ; Aft (outer latch): A[i] = sum
+ %add11 = add nuw i32 %i.028, 1
+ %exitcond29 = icmp eq i32 %add11, %I
+ br i1 %exitcond29, label %for.end12, label %for.body
+
+for.body6:
+ %0 = phi i32 [ 0, %for.body ], [ %add, %for.body6 ]
+ %j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
+ %arrayidx7 = getelementptr inbounds i32, i32* %B, i32 %j.026
+ %1 = load i32, i32* %arrayidx7, align 4
+ %add = add i32 %1, %0
+ %add9 = add nuw i32 %j.026, 1
+ %exitcond = icmp eq i32 %add9, %J
+ br i1 %exitcond, label %for.cond3.for.cond.cleanup5_crit_edge, label %for.body6
+
+for.end12:
+ ret void
+}
+
+
+; CHECK-LABEL: disabled8
+; Same as above with an extra outer loop nest
+define void @disabled8(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
+; CHECK: %j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
+entry:
+ %cmp = icmp eq i32 %J, 0
+ %cmp335 = icmp eq i32 %I, 0
+ %or.cond = or i1 %cmp, %cmp335
+ br i1 %or.cond, label %for.end18, label %for.body.preheader
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %x.037 = phi i32 [ %inc, %for.cond.cleanup4 ], [ 0, %for.body.preheader ]  ; counter of the extra enclosing loop
+ br label %for.outer
+
+for.cond.cleanup4:
+ %inc = add nuw nsw i32 %x.037, 1
+ %exitcond40 = icmp eq i32 %inc, 5  ; enclosing loop runs 5 times
+ br i1 %exitcond40, label %for.end18, label %for.body
+
+for.outer:
+ %i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.036
+ store i32 0, i32* %arrayidx, align 4  ; Fore: A[i] = 0
+ %sub = add i32 %i.036, -1
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %sub
+ store i32 2, i32* %arrayidx6, align 4  ; Fore: A[i-1] = 2, overwriting the previous iteration's latch store
+ br label %for.inner
+
+for.latch:
+ store i32 %add, i32* %arrayidx, align 4  ; Aft: A[i] = sum
+ %add15 = add nuw i32 %i.036, 1
+ %exitcond38 = icmp eq i32 %add15, %I
+ br i1 %exitcond38, label %for.cond.cleanup4, label %for.outer
+
+for.inner:
+ %0 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
+ %arrayidx11 = getelementptr inbounds i32, i32* %B, i32 %j.034
+ %1 = load i32, i32* %arrayidx11, align 4
+ %add = add i32 %1, %0
+ %add13 = add nuw i32 %j.034, 1
+ %exitcond = icmp eq i32 %add13, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.end18:
+ ret void
+}
+
+
+; CHECK-LABEL: disabled9
+; Can't prove that A and B do not alias
+define void @disabled9(i32 %I, i32 %J, i32* nocapture %A, i32* nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4  ; load through %B, which is not marked noalias in this test
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4  ; store through %A, which may overlap the %B loads
+ %add8 = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %add8, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: disable10
+; Simple call
+declare void @f10(i32, i32) #0
+define void @disable10(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ tail call void @f10(i32 %i, i32 %j) nounwind  ; call to the external @f10 in the inner loop body, invoked once per (i, j)
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4  ; A[i] = sum
+ %add8 = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %add8, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: disable11
+; volatile
+define void @disable11(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load volatile i32, i32* %arrayidx, align 4  ; volatile load in the inner loop: the only difference from a plain B[j] sum
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4  ; A[i] = sum
+ %add8 = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %add8, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: disable12
+; Multiple aft blocks: the code after the inner loop spans for.latch, for.latch2 and for.latch3; the patterns expect the input phis unchanged.
+define void @disable12(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch3 ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %add8, %for.latch3 ], [ 0, %for.outer.preheader ] ; back edge comes from for.latch3, not for.latch
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4
+ %cmpl = icmp eq i32 %add.lcssa, 10
+ br i1 %cmpl, label %for.latch2, label %for.latch3 ; conditional branch creates control flow after the inner loop
+
+for.latch2:
+ br label %for.latch3
+
+for.latch3:
+ %add8 = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %add8, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: disable13
+; Two subloops: for.inner is followed by a second inner loop, for.inner2, inside the same outer loop.
+define void @disable13(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK: %j2 = phi i32 [ %inc2, %for.inner2 ], [ 0, %for.inner2.preheader ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.inner2, label %for.inner ; first sub-loop exits straight into the second
+
+for.inner2:
+ %j2 = phi i32 [ 0, %for.inner ], [ %inc2, %for.inner2 ]
+ %sum12 = phi i32 [ 0, %for.inner ], [ %add2, %for.inner2 ]
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %j2
+ %l0 = load i32, i32* %arrayidx2, align 4
+ %add2 = add i32 %l0, %sum12
+ %inc2 = add nuw i32 %j2, 1
+ %exitcond2 = icmp eq i32 %inc2, %J
+ br i1 %exitcond2, label %for.latch, label %for.inner2
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner2 ] ; carries %add from the first sub-loop; %add2 is only used by the %sum12 phi
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4
+ %add8 = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %add8, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: disable14
+; Multiple exit blocks: both for.outer and for.latch can branch to for.end.loopexit.
+define void @disable14(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+ %add8 = add nuw i32 %i, 1 ; increment is computed in the header here, not in the latch
+ %exitcond23 = icmp eq i32 %add8, %I
+ br i1 %exitcond23, label %for.end.loopexit, label %for.inner ; first exit from the outer loop
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4
+ %exitcond25 = icmp eq i32 %add8, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer ; second exit from the outer loop
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: disable15
+; Latch != exit: for.latch branches unconditionally back to for.outer; the only exit to for.end.loopexit is from the for.outer header.
+define void @disable15(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+ %add8 = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %add8, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.inner ; the exiting branch lives in the header
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4
+ br label %for.outer ; unconditional back edge: the latch is not an exiting block
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: disable16
+; Cannot move other before inner loop: %other feeds the %otherphi header phi but is computed in for.latch from a load of A[i] that follows the store to A[i].
+define void @disable16(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+ %otherphi = phi i32 [ %other, %for.latch ], [ 0, %for.outer.preheader ] ; second loop-carried value, updated in the latch
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4
+ %add8 = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %add8, %I
+ %loadarr = getelementptr inbounds i32, i32* %A, i32 %i ; NOTE(review): unused; the load below goes through %arrayidx6 (same A[i] address)
+ %load = load i32, i32* %arrayidx6, align 4 ; reads back the value just stored to A[i]
+ %other = add i32 %otherphi, %load
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/pragma.ll b/llvm/test/Transforms/LoopUnrollAndJam/pragma.ll
new file mode 100644
index 0000000..d45a04c
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnrollAndJam/pragma.ll
@@ -0,0 +1,319 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime < %s -S | FileCheck %s
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-and-jam-threshold=15 < %s -S | FileCheck %s --check-prefix=CHECK-LOWTHRES
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: test1
+; Basic check that these loops are unroll-and-jammed by default (copy suffix matched as 1+ decimal digits); the low-threshold RUN line expects the original phi.
+define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.{{[0-9]+}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ] ; outer induction variable
+ br label %for.inner
+
+for.inner:
+ %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ] ; inner induction variable
+ %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ] ; running sum, restarted at 0 for each outer iteration
+ %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %add.us = add i32 %0, %sum1.us
+ %inc.us = add nuw i32 %j.us, 1
+ %exitcond = icmp eq i32 %inc.us, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+ %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+ store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4 ; A[i] = sum of B[0..J)
+ %add8.us = add nuw i32 %i.us, 1
+ %exitcond25 = icmp eq i32 %add8.us, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer ; no !llvm.loop metadata: default behaviour is tested
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: nounroll_and_jam
+; #pragma nounroll_and_jam: the outer latch carries llvm.loop.unroll_and_jam.disable (!1); the pattern expects the original phi.
+define void @nounroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+ %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+ %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %add.us = add i32 %0, %sum1.us
+ %inc.us = add nuw i32 %j.us, 1
+ %exitcond = icmp eq i32 %inc.us, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+ %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+ store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+ %add8.us = add nuw i32 %i.us, 1
+ %exitcond25 = icmp eq i32 %add8.us, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !1
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: unroll_and_jam_count
+; #pragma unroll_and_jam(8): the outer latch carries llvm.loop.unroll_and_jam.count 8 (!3); the pattern expects the phi to use the 8th increment, %add8.us.7.
+define void @unroll_and_jam_count(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.7, %for.latch ], [ 0, %for.outer.preheader.new ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+ %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+ %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %add.us = add i32 %0, %sum1.us
+ %inc.us = add nuw i32 %j.us, 1
+ %exitcond = icmp eq i32 %inc.us, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+ %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+ store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+ %add8.us = add nuw i32 %i.us, 1
+ %exitcond25 = icmp eq i32 %add8.us, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !3
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: unroll_and_jam
+; #pragma unroll_and_jam: llvm.loop.unroll_and_jam.enable (!5); both RUN lines expect the transformed phi (copy suffix matched as 1+ decimal digits).
+define void @unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.{{[0-9]+}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us.{{[0-9]+}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ] ; outer induction variable
+ br label %for.inner
+
+for.inner:
+ %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ] ; inner induction variable
+ %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ] ; running sum, restarted at 0 for each outer iteration
+ %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %add.us = add i32 %0, %sum1.us
+ %inc.us = add nuw i32 %j.us, 1
+ %exitcond = icmp eq i32 %inc.us, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+ %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+ store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4 ; A[i] = sum of B[0..J)
+ %add8.us = add nuw i32 %i.us, 1
+ %exitcond25 = icmp eq i32 %add8.us, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !5
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: nounroll
+; #pragma nounroll, i.e. llvm.loop.unroll.disable (!7), which we take to mean disable unroll and jam too; the pattern expects the original phi.
+define void @nounroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+ %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+ %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %add.us = add i32 %0, %sum1.us
+ %inc.us = add nuw i32 %j.us, 1
+ %exitcond = icmp eq i32 %inc.us, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+ %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+ store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+ %add8.us = add nuw i32 %i.us, 1
+ %exitcond25 = icmp eq i32 %add8.us, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !7
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: unroll
+; #pragma unroll, i.e. llvm.loop.unroll.enable (!9), which we take to mean disable unroll and jam; the pattern expects the original phi.
+define void @unroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+ %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+ %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %add.us = add i32 %0, %sum1.us
+ %inc.us = add nuw i32 %j.us, 1
+ %exitcond = icmp eq i32 %inc.us, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+ %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+ store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+ %add8.us = add nuw i32 %i.us, 1
+ %exitcond25 = icmp eq i32 %add8.us, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !9
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: nounroll_plus_unroll_and_jam
+; #pragma clang loop nounroll, unroll_and_jam (which we take to mean do unroll_and_jam); the copy suffix is matched as 1+ decimal digits.
+define void @nounroll_plus_unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.{{[0-9]+}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ] ; outer induction variable
+ br label %for.inner
+
+for.inner:
+ %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ] ; inner induction variable
+ %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ] ; running sum, restarted at 0 for each outer iteration
+ %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %add.us = add i32 %0, %sum1.us
+ %inc.us = add nuw i32 %j.us, 1
+ %exitcond = icmp eq i32 %inc.us, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+ %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+ store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4 ; A[i] = sum of B[0..J)
+ %add8.us = add nuw i32 %i.us, 1
+ %exitcond25 = icmp eq i32 %add8.us, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !11
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+!1 = distinct !{!1, !2} ; loop ID used by @nounroll_and_jam
+!2 = distinct !{!"llvm.loop.unroll_and_jam.disable"}
+!3 = distinct !{!3, !4} ; loop ID used by @unroll_and_jam_count
+!4 = distinct !{!"llvm.loop.unroll_and_jam.count", i32 8}
+!5 = distinct !{!5, !6} ; loop ID used by @unroll_and_jam
+!6 = distinct !{!"llvm.loop.unroll_and_jam.enable"}
+!7 = distinct !{!7, !8} ; loop ID used by @nounroll
+!8 = distinct !{!"llvm.loop.unroll.disable"}
+!9 = distinct !{!9, !10} ; loop ID used by @unroll
+!10 = distinct !{!"llvm.loop.unroll.enable"}
+!11 = distinct !{!11, !8, !6} ; loop ID used by @nounroll_plus_unroll_and_jam: unroll.disable (!8) plus unroll_and_jam.enable (!6)
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/unprofitable.ll b/llvm/test/Transforms/LoopUnrollAndJam/unprofitable.ll
new file mode 100644
index 0000000..64dbab8
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnrollAndJam/unprofitable.ll
@@ -0,0 +1,217 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -pass-remarks=loop-unroll < %s -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8m.main-arm-none-eabi"
+
+;; Common check for all tests: none of these loops should be unroll-and-jammed, because the transform is not considered profitable for them.
+; CHECK-NOT: remark: {{.*}} unroll and jammed
+
+
+; CHECK-LABEL: unprof1
+; Multiple inner loop blocks: the inner loop body is split across for.inner and for.inner2; the patterns expect the input phis unchanged.
+define void @unprof1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner2 ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ] ; outer induction variable
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner2 ] ; inner back edge comes from for.inner2
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner2 ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1
+ br label %for.inner2
+
+for.inner2:
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner2 ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4 ; A[i] = sum of B[0..J)
+ %addinc = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %addinc, %I
+ br i1 %exitcond25, label %for.loopexit, label %for.outer
+
+for.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: unprof2
+; Constant inner loop count: the inner loop exits when %inc reaches the constant 10 instead of %J.
+define void @unprof2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, 10 ; constant bound: the feature under test
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4
+ %addinc = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %addinc, %I
+ br i1 %exitcond25, label %for.loopexit, label %for.outer
+
+for.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: unprof3
+; Complex inner loop: the inner body is padded with 30 extra adds (%add0..%add29) that repeat %add's operands.
+define void @unprof3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1 ; the only add whose result is used (by the %sum1 phi and the latch)
+ %add0 = add i32 %0, %sum1 ; %add0..%add29 have no users in this function
+ %add1 = add i32 %0, %sum1
+ %add2 = add i32 %0, %sum1
+ %add3 = add i32 %0, %sum1
+ %add4 = add i32 %0, %sum1
+ %add5 = add i32 %0, %sum1
+ %add6 = add i32 %0, %sum1
+ %add7 = add i32 %0, %sum1
+ %add8 = add i32 %0, %sum1
+ %add9 = add i32 %0, %sum1
+ %add10 = add i32 %0, %sum1
+ %add11 = add i32 %0, %sum1
+ %add12 = add i32 %0, %sum1
+ %add13 = add i32 %0, %sum1
+ %add14 = add i32 %0, %sum1
+ %add15 = add i32 %0, %sum1
+ %add16 = add i32 %0, %sum1
+ %add17 = add i32 %0, %sum1
+ %add18 = add i32 %0, %sum1
+ %add19 = add i32 %0, %sum1
+ %add20 = add i32 %0, %sum1
+ %add21 = add i32 %0, %sum1
+ %add22 = add i32 %0, %sum1
+ %add23 = add i32 %0, %sum1
+ %add24 = add i32 %0, %sum1
+ %add25 = add i32 %0, %sum1
+ %add26 = add i32 %0, %sum1
+ %add27 = add i32 %0, %sum1
+ %add28 = add i32 %0, %sum1
+ %add29 = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4
+ %addinc = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %addinc, %I
+ br i1 %exitcond25, label %for.loopexit, label %for.outer
+
+for.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: unprof4
+; No loop invariant loads: the only load reads B[j + i], so its address involves the outer induction variable %i.
+define void @unprof4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+ %cmp = icmp ne i32 %J, 0
+ %cmp122 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp122 ; enter the loop nest only when both %I and %J are non-zero
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:
+ %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %j2 = add i32 %j, %i ; index mixes the inner and outer induction variables
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j2
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %sum1
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4
+ %addinc = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %addinc, %I
+ br i1 %exitcond25, label %for.loopexit, label %for.outer
+
+for.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll
new file mode 100644
index 0000000..bdb47c2
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll
@@ -0,0 +1,735 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: test1
+; Tests for(i) { sum = 0; for(j) sum += B[j]; A[i] = sum; }
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[J:%.*]], 0
+; CHECK-NEXT: [[CMPJ:%.*]] = icmp ne i32 [[I:%.*]], 0
+; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMPJ]]
+; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.outer.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I]], -1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[I]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3
+; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_OUTER_PREHEADER_NEW:%.*]]
+; CHECK: for.outer.preheader.new:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]]
+; CHECK-NEXT: br label [[FOR_OUTER:%.*]]
+; CHECK: for.outer:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD8_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_OUTER_PREHEADER_NEW]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[FOR_LATCH]] ]
+; CHECK-NEXT: [[ADD8:%.*]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
+; CHECK-NEXT: [[ADD8_1:%.*]] = add nuw nsw i32 [[ADD8]], 1
+; CHECK-NEXT: [[NITER_NSUB_1:%.*]] = sub i32 [[NITER_NSUB]], 1
+; CHECK-NEXT: [[ADD8_2:%.*]] = add nuw nsw i32 [[ADD8_1]], 1
+; CHECK-NEXT: [[NITER_NSUB_2:%.*]] = sub i32 [[NITER_NSUB_1]], 1
+; CHECK-NEXT: [[ADD8_3]] = add nuw i32 [[ADD8_2]], 1
+; CHECK-NEXT: [[NITER_NSUB_3]] = sub i32 [[NITER_NSUB_2]], 1
+; CHECK-NEXT: br label [[FOR_INNER:%.*]]
+; CHECK: for.inner:
+; CHECK-NEXT: [[J_0:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_1:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_1:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_2:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_2:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_3:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_3:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[J_0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD]] = add i32 [[TMP2]], [[SUM]]
+; CHECK-NEXT: [[INC]] = add nuw i32 [[J_0]], 1
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD_1]] = add i32 [[TMP3]], [[SUM_1]]
+; CHECK-NEXT: [[INC_1]] = add nuw i32 [[J_1]], 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD_2]] = add i32 [[TMP4]], [[SUM_2]]
+; CHECK-NEXT: [[INC_2]] = add nuw i32 [[J_2]], 1
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_3]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD_3]] = add i32 [[TMP5]], [[SUM_3]]
+; CHECK-NEXT: [[INC_3]] = add nuw i32 [[J_3]], 1
+; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[J]]
+; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]]
+; CHECK: for.latch:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I]]
+; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* [[ARRAYIDX6]], align 4, !tbaa !0
+; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8]]
+; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], i32* [[ARRAYIDX6_1]], align 4, !tbaa !0
+; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_1]]
+; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], i32* [[ARRAYIDX6_2]], align 4, !tbaa !0
+; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_2]]
+; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], i32* [[ARRAYIDX6_3]], align 4, !tbaa !0
+; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i32 [[NITER_NSUB_3]], 0
+; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_OUTER]], !llvm.loop !4
+; CHECK: for.end.loopexit.unr-lcssa.loopexit:
+; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD8_3]], [[FOR_LATCH]] ]
+; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]]
+; CHECK: for.end.loopexit.unr-lcssa:
+; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK: for.outer.epil.preheader:
+; CHECK-NEXT: br label [[FOR_OUTER_EPIL:%.*]]
+; CHECK: for.outer.epil:
+; CHECK-NEXT: br label [[FOR_INNER_EPIL:%.*]]
+; CHECK: for.inner.epil:
+; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], [[FOR_INNER_EPIL]] ]
+; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[ADD_EPIL:%.*]], [[FOR_INNER_EPIL]] ]
+; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_EPIL]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD_EPIL]] = add i32 [[TMP6]], [[SUM_EPIL]]
+; CHECK-NEXT: [[INC_EPIL]] = add nuw i32 [[J_EPIL]], 1
+; CHECK-NEXT: [[EXITCOND_EPIL:%.*]] = icmp eq i32 [[INC_EPIL]], [[J]]
+; CHECK-NEXT: br i1 [[EXITCOND_EPIL]], label [[FOR_LATCH_EPIL:%.*]], label [[FOR_INNER_EPIL]]
+; CHECK: for.latch.epil:
+; CHECK-NEXT: [[ADD_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD_EPIL]], [[FOR_INNER_EPIL]] ]
+; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_UNR]]
+; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL]], i32* [[ARRAYIDX6_EPIL]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD8_EPIL:%.*]] = add nuw i32 [[I_UNR]], 1
+; CHECK-NEXT: [[EPIL_ITER_SUB:%.*]] = sub i32 [[XTRAITER]], 1
+; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 [[EPIL_ITER_SUB]], 0
+; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_OUTER_EPIL_1:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA:%.*]]
+; CHECK: for.end.loopexit.epilog-lcssa:
+; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+; CHECK: for.outer.epil.1:
+; CHECK-NEXT: br label [[FOR_INNER_EPIL_1:%.*]]
+; CHECK: for.inner.epil.1:
+; CHECK-NEXT: [[J_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[INC_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ]
+; CHECK-NEXT: [[SUM_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[ADD_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ]
+; CHECK-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL_1]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_1]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD_EPIL_1]] = add i32 [[TMP7]], [[SUM_EPIL_1]]
+; CHECK-NEXT: [[INC_EPIL_1]] = add nuw i32 [[J_EPIL_1]], 1
+; CHECK-NEXT: [[EXITCOND_EPIL_1:%.*]] = icmp eq i32 [[INC_EPIL_1]], [[J]]
+; CHECK-NEXT: br i1 [[EXITCOND_EPIL_1]], label [[FOR_LATCH_EPIL_1:%.*]], label [[FOR_INNER_EPIL_1]]
+; CHECK: for.latch.epil.1:
+; CHECK-NEXT: [[ADD_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD_EPIL_1]], [[FOR_INNER_EPIL_1]] ]
+; CHECK-NEXT: [[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_EPIL]]
+; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_1]], i32* [[ARRAYIDX6_EPIL_1]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD8_EPIL_1:%.*]] = add nuw i32 [[ADD8_EPIL]], 1
+; CHECK-NEXT: [[EPIL_ITER_SUB_1:%.*]] = sub i32 [[EPIL_ITER_SUB]], 1
+; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 [[EPIL_ITER_SUB_1]], 0
+; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_OUTER_EPIL_2:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]]
+; CHECK: for.outer.epil.2:
+; CHECK-NEXT: br label [[FOR_INNER_EPIL_2:%.*]]
+; CHECK: for.inner.epil.2:
+; CHECK-NEXT: [[J_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[INC_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ]
+; CHECK-NEXT: [[SUM_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[ADD_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ]
+; CHECK-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL_2]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_2]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD_EPIL_2]] = add i32 [[TMP8]], [[SUM_EPIL_2]]
+; CHECK-NEXT: [[INC_EPIL_2]] = add nuw i32 [[J_EPIL_2]], 1
+; CHECK-NEXT: [[EXITCOND_EPIL_2:%.*]] = icmp eq i32 [[INC_EPIL_2]], [[J]]
+; CHECK-NEXT: br i1 [[EXITCOND_EPIL_2]], label [[FOR_LATCH_EPIL_2:%.*]], label [[FOR_INNER_EPIL_2]]
+; CHECK: for.latch.epil.2:
+; CHECK-NEXT: [[ADD_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD_EPIL_2]], [[FOR_INNER_EPIL_2]] ]
+; CHECK-NEXT: [[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_EPIL_1]]
+; CHECK-NEXT: store i32 [[ADD_LCSSA_EPIL_2]], i32* [[ARRAYIDX6_EPIL_2]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD8_EPIL_2:%.*]] = add nuw i32 [[ADD8_EPIL_1]], 1
+; CHECK-NEXT: [[EPIL_ITER_SUB_2:%.*]] = sub i32 [[EPIL_ITER_SUB_1]], 1
+; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]]
+define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:  ; Guard: the nest is entered only when both %J and %I are non-zero.
+ %cmp = icmp ne i32 %J, 0
+ %cmpJ = icmp ne i32 %I, 0  ; despite the name, this tests the outer bound %I
+ %or.cond = and i1 %cmp, %cmpJ
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:  ; Outer loop header: contains only the %i counter, no memory accesses.
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:  ; Single-block inner loop: sum += B[j] for j in [0, %J); the address does not involve %i.
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
+ %add = add i32 %0, %sum
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:  ; Outer loop latch: A[i] = sum, then advance %i until it equals %I.
+ %add.lcssa = phi i32 [ %add, %for.inner ]  ; single-entry phi carrying the inner-loop result out of the inner loop
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4, !tbaa !5
+ %add8 = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %add8, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: test2
+; Tests for(i) { sum = A[i]; for(j) sum += B[j]; A[i] = sum; }
+; A[i] load/store dependency should not block unroll-and-jam
+; CHECK: for.outer:
+; CHECK: %i = phi i32 [ %add9.3, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK: %niter = phi i32 [ %unroll_iter, %for.outer.preheader.new ], [ %niter.nsub.3, %for.latch ]
+; CHECK: br label %for.inner
+; CHECK: for.inner:
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK: %sum = phi i32 [ %2, %for.outer ], [ %add, %for.inner ]
+; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %inc.1, %for.inner ]
+; CHECK: %sum.1 = phi i32 [ %3, %for.outer ], [ %add.1, %for.inner ]
+; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %inc.2, %for.inner ]
+; CHECK: %sum.2 = phi i32 [ %4, %for.outer ], [ %add.2, %for.inner ]
+; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %inc.3, %for.inner ]
+; CHECK: %sum.3 = phi i32 [ %5, %for.outer ], [ %add.3, %for.inner ]
+; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK: %add.lcssa = phi i32 [ %add, %for.inner ]
+; CHECK: %add.lcssa.1 = phi i32 [ %add.1, %for.inner ]
+; CHECK: %add.lcssa.2 = phi i32 [ %add.2, %for.inner ]
+; CHECK: %add.lcssa.3 = phi i32 [ %add.3, %for.inner ]
+; CHECK: br i1 %niter.ncmp.3, label %for.end10.loopexit.unr-lcssa.loopexit, label %for.outer
+; CHECK: for.end10.loopexit.unr-lcssa.loopexit:
+define void @test2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:  ; Guard: the nest is entered only when both %J and %I are non-zero.
+ %cmp = icmp ne i32 %J, 0
+ %cmp125 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp, %cmp125
+ br i1 %or.cond, label %for.outer.preheader, label %for.end10
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:  ; Outer loop header: loads A[i], which seeds the inner-loop accumulator.
+ %i = phi i32 [ %add9, %for.latch ], [ 0, %for.outer.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i  ; this address is reused by the store in for.latch
+ %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
+ br label %for.inner
+
+for.inner:  ; Single-block inner loop: sum += B[j] for j in [0, %J), starting from the loaded A[i].
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum = phi i32 [ %0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j
+ %1 = load i32, i32* %arrayidx6, align 4, !tbaa !5
+ %add = add i32 %1, %sum
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:  ; Outer loop latch: writes the sum back to the same A[i] that for.outer loaded.
+ %add.lcssa = phi i32 [ %add, %for.inner ]
+ store i32 %add.lcssa, i32* %arrayidx, align 4, !tbaa !5
+ %add9 = add nuw i32 %i, 1
+ %exitcond28 = icmp eq i32 %add9, %I
+ br i1 %exitcond28, label %for.end10.loopexit, label %for.outer
+
+for.end10.loopexit:
+ br label %for.end10
+
+for.end10:
+ ret void
+}
+
+
+; CHECK-LABEL: test3
+; Tests Complete unroll-and-jam of the outer loop
+; CHECK: for.outer:
+; CHECK: br label %for.inner
+; CHECK: for.inner:
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %inc.1, %for.inner ]
+; CHECK: %sum.1 = phi i32 [ 0, %for.outer ], [ %add.1, %for.inner ]
+; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %inc.2, %for.inner ]
+; CHECK: %sum.2 = phi i32 [ 0, %for.outer ], [ %add.2, %for.inner ]
+; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %inc.3, %for.inner ]
+; CHECK: %sum.3 = phi i32 [ 0, %for.outer ], [ %add.3, %for.inner ]
+; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK: %add.lcssa = phi i32 [ %add, %for.inner ]
+; CHECK: %add.lcssa.1 = phi i32 [ %add.1, %for.inner ]
+; CHECK: %add.lcssa.2 = phi i32 [ %add.2, %for.inner ]
+; CHECK: %add.lcssa.3 = phi i32 [ %add.3, %for.inner ]
+; CHECK: br label %for.end
+; CHECK: for.end:
+define void @test3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:  ; Guard: skip the nest when the inner bound %J is zero; %I is not used in this function.
+ %cmp = icmp eq i32 %J, 0
+ br i1 %cmp, label %for.end, label %for.preheader
+
+for.preheader:
+ br label %for.outer
+
+for.outer:  ; Outer loop header: %i starts at 0.
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.preheader ]
+ br label %for.inner
+
+for.inner:  ; Single-block inner loop: sum = sum + 10 - B[j] for j in [0, %J).
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
+ %sub = add i32 %sum, 10  ; note: the value named %sub is produced by an add
+ %add = sub i32 %sub, %0  ; note: the value named %add is produced by a sub
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:  ; Outer loop latch: uses %add directly; the input has no single-entry phi here.
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add, i32* %arrayidx6, align 4, !tbaa !5
+ %add8 = add nuw nsw i32 %i, 1
+ %exitcond23 = icmp eq i32 %add8, 4  ; constant bound: the outer loop runs for i = 0, 1, 2, 3
+ br i1 %exitcond23, label %for.end, label %for.outer
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: test4
+; Tests Complete unroll-and-jam with a trip count of 1
+; CHECK: for.outer:
+; CHECK: br label %for.inner
+; CHECK: for.inner:
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+; CHECK: br i1 %exitcond, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK: %add.lcssa = phi i32 [ %add, %for.inner ]
+; CHECK: br label %for.end
+; CHECK: for.end:
+define void @test4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:  ; Guard: skip the nest when the inner bound %J is zero; %I is not used in this function.
+ %cmp = icmp eq i32 %J, 0
+ br i1 %cmp, label %for.end, label %for.preheader
+
+for.preheader:
+ br label %for.outer
+
+for.outer:  ; Outer loop header: %i starts at 0.
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.preheader ]
+ br label %for.inner
+
+for.inner:  ; Single-block inner loop: sum = sum + 10 - B[j] for j in [0, %J).
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+ %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
+ %sub = add i32 %sum, 10  ; note: the value named %sub is produced by an add
+ %add = sub i32 %sub, %0  ; note: the value named %add is produced by a sub
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:  ; Outer loop latch: uses %add directly; the input has no single-entry phi here.
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add, i32* %arrayidx6, align 4, !tbaa !5
+ %add8 = add nuw nsw i32 %i, 1
+ %exitcond23 = icmp eq i32 %add8, 1  ; constant bound: the outer loop exits after its first iteration (i = 0)
+ br i1 %exitcond23, label %for.end, label %for.outer
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: test5
+; Multiple SubLoopBlocks
+; CHECK: for.outer:
+; CHECK: br label %for.inner
+; CHECK: for.inner:
+; CHECK: %inc8.sink15 = phi i32 [ 0, %for.outer ], [ %inc8, %for.inc.1 ]
+; CHECK: %inc8.sink15.1 = phi i32 [ 0, %for.outer ], [ %inc8.1, %for.inc.1 ]
+; CHECK: br label %for.inner2
+; CHECK: for.inner2:
+; CHECK: br i1 %tobool, label %for.cond4, label %for.inc
+; CHECK: for.cond4:
+; CHECK: br i1 %tobool.1, label %for.cond4a, label %for.inc
+; CHECK: for.cond4a:
+; CHECK: br label %for.inc
+; CHECK: for.inc:
+; CHECK: br i1 %tobool.11, label %for.cond4.1, label %for.inc.1
+; CHECK: for.latch:
+; CHECK: br label %for.end
+; CHECK: for.end:
+; CHECK: ret i32 0
+; CHECK: for.cond4.1:
+; CHECK: br i1 %tobool.1.1, label %for.cond4a.1, label %for.inc.1
+; CHECK: for.cond4a.1:
+; CHECK: br label %for.inc.1
+; CHECK: for.inc.1:
+; CHECK: br i1 %exitcond.1, label %for.latch, label %for.inner
+@a = hidden global [1 x i32] zeroinitializer, align 4
+define i32 @test5() #0 {
+entry:  ; No arguments and no guard: control falls straight into the outer loop.
+ br label %for.outer
+
+for.outer:  ; Outer loop header: %.sink16 starts at 0 and takes %add from the latch.
+ %.sink16 = phi i32 [ 0, %entry ], [ %add, %for.latch ]
+ br label %for.inner
+
+for.inner:  ; Inner loop header; the inner loop spans for.inner, for.inner2, for.cond4, for.cond4a and for.inc.
+ %inc8.sink15 = phi i32 [ 0, %for.outer ], [ %inc8, %for.inc ]
+ br label %for.inner2
+
+for.inner2:
+ %l1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 0, i32 0), align 4
+ %tobool = icmp eq i32 %l1, 0
+ br i1 %tobool, label %for.cond4, label %for.inc
+
+for.cond4:
+ %l0 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 1, i32 0), align 4  ; NOTE(review): first index 1 steps one whole [1 x i32] past @a, so this load reads beyond the global - confirm this is intentional
+ %tobool.1 = icmp eq i32 %l0, 0
+ br i1 %tobool.1, label %for.cond4a, label %for.inc
+
+for.cond4a:
+ br label %for.inc
+
+for.inc:  ; Inner loop latch: joins the three paths and counts %inc8 up to 3.
+ %l2 = phi i32 [ 0, %for.inner2 ], [ 1, %for.cond4 ], [ 2, %for.cond4a ]  ; records which predecessor reached this block
+ %inc8 = add nuw nsw i32 %inc8.sink15, 1
+ %exitcond = icmp eq i32 %inc8, 3
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:  ; Outer loop latch: %add = (%.sink16 & 255) + 4; exits when %add == 8, i.e. on the second iteration (0 -> 4 -> 8).
+ %.lcssa = phi i32 [ %l2, %for.inc ]
+ %conv11 = and i32 %.sink16, 255
+ %add = add nuw nsw i32 %conv11, 4
+ %cmp = icmp eq i32 %add, 8
+ br i1 %cmp, label %for.end, label %for.outer
+
+for.end:
+ %.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]  ; has no users: the function returns the constant 0
+ ret i32 0
+}
+
+
+; CHECK-LABEL: test6
+; Test odd uses of phi nodes
+; CHECK: for.outer:
+; CHECK: br label %for.inner
+; CHECK: for.inner:
+; CHECK: br i1 %exitcond.3, label %for.inner, label %for.latch
+; CHECK: for.latch:
+; CHECK: br label %for.end
+; CHECK: for.end:
+; CHECK: ret i32 0
+@f = hidden global i32 0, align 4
+define i32 @test6() #0 {
+entry:  ; Loads @f once before the nest; the value only feeds the %p0 phi.
+ %f.promoted10 = load i32, i32* @f, align 4, !tbaa !5
+ br label %for.outer
+
+for.outer:  ; Outer loop header.
+ %p0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]  ; the back-edge value is the constant 2, not a loop-computed value
+ %inc5.sink9 = phi i32 [ 2, %entry ], [ %inc5, %for.latch ]  ; outer counter starts at 2
+ br label %for.inner
+
+for.inner:  ; Single-block inner loop; it contains no memory accesses.
+ %p1 = phi i32 [ %p0, %for.outer ], [ 2, %for.inner ]  ; takes the outer phi on entry and the constant 2 on the back edge
+ %inc.sink8 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %inc = add nuw nsw i32 %inc.sink8, 1
+ %exitcond = icmp ne i32 %inc, 7  ; inverted sense: true means keep looping
+ br i1 %exitcond, label %for.inner, label %for.latch
+
+for.latch:  ; Outer loop latch: continues while the incremented counter != 7.
+ %.lcssa = phi i32 [ %p1, %for.inner ]
+ %inc5 = add nuw nsw i32 %inc5.sink9, 1
+ %exitcond11 = icmp ne i32 %inc5, 7
+ br i1 %exitcond11, label %for.outer, label %for.end
+
+for.end:  ; Neither phi below has a user: the function returns the constant 0.
+ %.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
+ %inc.lcssa.lcssa = phi i32 [ 7, %for.latch ]
+ ret i32 0
+}
+
+
+; CHECK-LABEL: test7
+; Has a positive dependency between two stores. Still valid.
+; The negative dependency is in unroll-and-jam-disabled.ll
+; CHECK: for.outer:
+; CHECK: %i = phi i32 [ %add.3, %for.latch ], [ 0, %for.preheader.new ]
+; CHECK: %niter = phi i32 [ %unroll_iter, %for.preheader.new ], [ %niter.nsub.3, %for.latch ]
+; CHECK: br label %for.inner
+; CHECK: for.latch:
+; CHECK: %add9.lcssa = phi i32 [ %add9, %for.inner ]
+; CHECK: %add9.lcssa.1 = phi i32 [ %add9.1, %for.inner ]
+; CHECK: %add9.lcssa.2 = phi i32 [ %add9.2, %for.inner ]
+; CHECK: %add9.lcssa.3 = phi i32 [ %add9.3, %for.inner ]
+; CHECK: br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit, label %for.outer
+; CHECK: for.inner:
+; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
+; CHECK: %sum.1 = phi i32 [ 0, %for.outer ], [ %add9.1, %for.inner ]
+; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %add10.1, %for.inner ]
+; CHECK: %sum.2 = phi i32 [ 0, %for.outer ], [ %add9.2, %for.inner ]
+; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %add10.2, %for.inner ]
+; CHECK: %sum.3 = phi i32 [ 0, %for.outer ], [ %add9.3, %for.inner ]
+; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %add10.3, %for.inner ]
+; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.end.loopexit.unr-lcssa.loopexit:
+define void @test7(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:  ; Guard: the nest is entered only when both %J and %I are non-zero.
+ %cmp = icmp ne i32 %J, 0
+ %cmp128 = icmp ne i32 %I, 0
+ %or.cond = and i1 %cmp128, %cmp
+ br i1 %or.cond, label %for.preheader, label %for.end
+
+for.preheader:
+ br label %for.outer
+
+for.outer:  ; Outer loop header: stores A[i] = 0 and A[i+1] = 2 before the inner loop.
+ %i = phi i32 [ %add, %for.latch ], [ 0, %for.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 0, i32* %arrayidx, align 4, !tbaa !5
+ %add = add nuw i32 %i, 1  ; serves as both the A[i+1] index and the next value of %i
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %add
+ store i32 2, i32* %arrayidx2, align 4, !tbaa !5  ; A[i+1]: the element the next outer iteration addresses as A[i]
+ br label %for.inner
+
+for.latch:  ; Outer loop latch, placed textually before for.inner; uses %add9 directly (no single-entry phi in the input).
+ store i32 %add9, i32* %arrayidx, align 4, !tbaa !5  ; overwrites A[i] with the inner-loop sum
+ %exitcond30 = icmp eq i32 %add, %I
+ br i1 %exitcond30, label %for.end, label %for.outer
+
+for.inner:  ; Single-block inner loop: sum += B[j] for j in [0, %J).
+ %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
+ %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
+ %arrayidx7 = getelementptr inbounds i32, i32* %B, i32 %j
+ %l1 = load i32, i32* %arrayidx7, align 4, !tbaa !5
+ %add9 = add i32 %l1, %sum
+ %add10 = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %add10, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: test8
+; Same as test7 with an extra outer loop nest
+; CHECK: for.outest:
+; CHECK: br label %for.outer
+; CHECK: for.outer:
+; CHECK: %i = phi i32 [ %add.3, %for.latch ], [ 0, %for.outest.new ]
+; CHECK: %niter = phi i32 [ %unroll_iter, %for.outest.new ], [ %niter.nsub.3, %for.latch ]
+; CHECK: br label %for.inner
+; CHECK: for.inner:
+; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
+; CHECK: %sum.1 = phi i32 [ 0, %for.outer ], [ %add9.1, %for.inner ]
+; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %add10.1, %for.inner ]
+; CHECK: %sum.2 = phi i32 [ 0, %for.outer ], [ %add9.2, %for.inner ]
+; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %add10.2, %for.inner ]
+; CHECK: %sum.3 = phi i32 [ 0, %for.outer ], [ %add9.3, %for.inner ]
+; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %add10.3, %for.inner ]
+; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK: %add9.lcssa = phi i32 [ %add9, %for.inner ]
+; CHECK: %add9.lcssa.1 = phi i32 [ %add9.1, %for.inner ]
+; CHECK: %add9.lcssa.2 = phi i32 [ %add9.2, %for.inner ]
+; CHECK: %add9.lcssa.3 = phi i32 [ %add9.3, %for.inner ]
+; CHECK: br i1 %niter.ncmp.3, label %for.cleanup.unr-lcssa.loopexit, label %for.outer
+; CHECK: for.cleanup.epilog-lcssa:
+; CHECK: br label %for.cleanup
+; CHECK: for.cleanup:
+; CHECK: br i1 %exitcond41, label %for.end.loopexit, label %for.outest
+; CHECK: for.end.loopexit:
+; CHECK: br label %for.end
+define void @test8(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:  ; Guard: skip everything when either %J or %I is zero.
+ %cmp = icmp eq i32 %J, 0
+ %cmp336 = icmp eq i32 %I, 0
+ %or.cond = or i1 %cmp, %cmp336
+ br i1 %or.cond, label %for.end, label %for.preheader
+
+for.preheader:
+ br label %for.outest
+
+for.outest:  ; Outermost loop header: %x.038 counts from 0 and wraps the for.outer/for.inner nest.
+ %x.038 = phi i32 [ %inc, %for.cleanup ], [ 0, %for.preheader ]
+ br label %for.outer
+
+for.outer:  ; Middle loop header: stores A[i] = 0 and A[i+1] = 2 before the inner loop.
+ %i = phi i32 [ %add, %for.latch ], [ 0, %for.outest ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 0, i32* %arrayidx, align 4, !tbaa !5
+ %add = add nuw i32 %i, 1  ; serves as both the A[i+1] index and the next value of %i
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %add
+ store i32 2, i32* %arrayidx6, align 4, !tbaa !5  ; A[i+1]: the element the next iteration addresses as A[i]
+ br label %for.inner
+
+for.inner:  ; Single-block innermost loop: sum += B[j] for j in [0, %J).
+ %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
+ %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
+ %arrayidx11 = getelementptr inbounds i32, i32* %B, i32 %j
+ %l1 = load i32, i32* %arrayidx11, align 4, !tbaa !5
+ %add9 = add i32 %l1, %sum
+ %add10 = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %add10, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:  ; Middle loop latch: overwrites A[i] with the sum; uses %add9 directly (no single-entry phi in the input).
+ store i32 %add9, i32* %arrayidx, align 4, !tbaa !5
+ %exitcond39 = icmp eq i32 %add, %I
+ br i1 %exitcond39, label %for.cleanup, label %for.outer
+
+for.cleanup:  ; Outermost loop latch: exits once the incremented counter equals 5.
+ %inc = add nuw nsw i32 %x.038, 1
+ %exitcond41 = icmp eq i32 %inc, 5
+ br i1 %exitcond41, label %for.end, label %for.outest
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: test9
+; Same as test1 with tbaa, not noalias
+; CHECK: for.outer:
+; CHECK: %i = phi i32 [ %add8.3, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK: %niter = phi i32 [ %unroll_iter, %for.outer.preheader.new ], [ %niter.nsub.3, %for.latch ]
+; CHECK: br label %for.inner
+; CHECK: for.inner:
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK: %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+; CHECK: %j.1 = phi i32 [ 0, %for.outer ], [ %inc.1, %for.inner ]
+; CHECK: %sum.1 = phi i32 [ 0, %for.outer ], [ %add.1, %for.inner ]
+; CHECK: %j.2 = phi i32 [ 0, %for.outer ], [ %inc.2, %for.inner ]
+; CHECK: %sum.2 = phi i32 [ 0, %for.outer ], [ %add.2, %for.inner ]
+; CHECK: %j.3 = phi i32 [ 0, %for.outer ], [ %inc.3, %for.inner ]
+; CHECK: %sum.3 = phi i32 [ 0, %for.outer ], [ %add.3, %for.inner ]
+; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK: %add.lcssa = phi i32 [ %add, %for.inner ]
+; CHECK: %add.lcssa.1 = phi i32 [ %add.1, %for.inner ]
+; CHECK: %add.lcssa.2 = phi i32 [ %add.2, %for.inner ]
+; CHECK: %add.lcssa.3 = phi i32 [ %add.3, %for.inner ]
+; CHECK: br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit, label %for.outer
+; CHECK: for.end.loopexit.unr-lcssa.loopexit:
+define void @test9(i32 %I, i32 %J, i32* nocapture %A, i16* nocapture readonly %B) #0 {
+entry:  ; Guard: the nest is entered only when both %J and %I are non-zero. %A and %B carry no noalias here.
+ %cmp = icmp ne i32 %J, 0
+ %cmpJ = icmp ne i32 %I, 0  ; despite the name, this tests the outer bound %I
+ %or.cond = and i1 %cmp, %cmpJ
+ br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.outer
+
+for.outer:  ; Outer loop header: contains only the %i counter, no memory accesses.
+ %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+ br label %for.inner
+
+for.inner:  ; Single-block inner loop: sum += sext(B[j]) for j in [0, %J), with B an i16 array.
+ %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+ %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+ %arrayidx = getelementptr inbounds i16, i16* %B, i32 %j
+ %0 = load i16, i16* %arrayidx, align 2, !tbaa !9  ; align 2: consecutive i16 elements are 2 bytes apart, so a larger alignment cannot hold for every %j
+ %sext = sext i16 %0 to i32
+ %add = add i32 %sext, %sum
+ %inc = add nuw i32 %j, 1
+ %exitcond = icmp eq i32 %inc, %J
+ br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:  ; Outer loop latch: A[i] = sum (tagged !5, unlike the !9 load above), then advance %i until it equals %I.
+ %add.lcssa = phi i32 [ %add, %for.inner ]  ; single-entry phi carrying the inner-loop result out of the inner loop
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+ store i32 %add.lcssa, i32* %arrayidx6, align 4, !tbaa !5
+ %add8 = add nuw i32 %i, 1
+ %exitcond25 = icmp eq i32 %add8, %I
+ br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; CHECK-LABEL: test10
+; Be careful not to incorrectly update the exit phi nodes
+; CHECK: %dec.lcssa.lcssa.ph.ph = phi i64 [ 0, %for.inc24 ]
+%struct.a = type { i64 }
+@g = common global %struct.a zeroinitializer, align 8
+@c = common global [1 x i8] zeroinitializer, align 1
+define signext i16 @test10(i32 %k) #0 {
+entry:  ; Computes two conditions once, outside both loops: (c[0] == 0) and (%k != 0).
+ %0 = load i8, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @c, i64 0, i64 0), align 1
+ %tobool9 = icmp eq i8 %0, 0
+ %tobool13 = icmp ne i32 %k, 0
+ br label %for.body
+
+for.body:  ; Outer loop header: %storemerge82 counts up from 0.
+ %storemerge82 = phi i64 [ 0, %entry ], [ %inc25, %for.inc24 ]
+ br label %for.body2
+
+for.body2:  ; Inner loop header: %storemerge counts down from 4; branches on the entry-computed conditions.
+ %storemerge = phi i64 [ 4, %for.body ], [ %dec, %for.inc21 ]
+ br i1 %tobool9, label %for.body2.split, label %for.body2.split2
+
+for.body2.split2:
+ br i1 %tobool13, label %for.inc21, label %for.inc21.if
+
+for.body2.split:
+ br i1 %tobool13, label %for.inc21, label %for.inc21.then
+
+for.inc21.if:
+ %storemerge.1 = phi i64 [ 0, %for.body2.split2 ]
+ br label %for.inc21
+
+for.inc21.then:
+ %storemerge.2 = phi i64 [ 0, %for.body2.split ]
+ %storemerge.3 = phi i32 [ 0, %for.body2.split ]
+ br label %for.inc21
+
+for.inc21:  ; Inner loop latch: merges all four incoming paths, then decrements until %dec == 0.
+ %storemerge.4 = phi i64 [ %storemerge.1, %for.inc21.if ], [ %storemerge.2, %for.inc21.then ], [ 4, %for.body2.split2 ], [ 4, %for.body2.split ]
+ %storemerge.5 = phi i32 [ 0, %for.inc21.if ], [ %storemerge.3, %for.inc21.then ], [ 0, %for.body2.split2 ], [ 0, %for.body2.split ]
+ %dec = add nsw i64 %storemerge, -1
+ %tobool = icmp eq i64 %dec, 0
+ br i1 %tobool, label %for.inc24, label %for.body2
+
+for.inc24:  ; Outer loop latch: continues while the incremented counter != 5.
+ %storemerge.4.lcssa = phi i64 [ %storemerge.4, %for.inc21 ]
+ %storemerge.5.lcssa = phi i32 [ %storemerge.5, %for.inc21 ]
+ %inc25 = add nuw nsw i64 %storemerge82, 1
+ %exitcond = icmp ne i64 %inc25, 5  ; inverted sense: true means keep looping
+ br i1 %exitcond, label %for.body, label %for.end26
+
+for.end26:  ; Exit block: the exit phis here are what this test is about.
+ %dec.lcssa.lcssa = phi i64 [ 0, %for.inc24 ]  ; incoming value is the constant 0, not a value defined inside the loop
+ %storemerge.4.lcssa.lcssa = phi i64 [ %storemerge.4.lcssa, %for.inc24 ]
+ %storemerge.5.lcssa.lcssa = phi i32 [ %storemerge.5.lcssa, %for.inc24 ]
+ store i64 %dec.lcssa.lcssa, i64* getelementptr inbounds (%struct.a, %struct.a* @g, i64 0, i32 0), align 8  ; writes the constant-fed phi to the first field of @g
+ ret i16 0
+}
+
+
+!5 = !{!6, !6, i64 0}  ; access tag for "int" accesses: base type !6, access type !6, offset 0
+!6 = !{!"int", !7, i64 0}  ; type node "int", parent !7
+!7 = !{!"omnipotent char", !8, i64 0}  ; type node "omnipotent char", parent is the root !8
+!8 = !{!"Simple C/C++ TBAA"}  ; root of this type hierarchy
+!9 = !{!10, !10, i64 0}  ; access tag for "short" accesses: used by the i16 load in test9
+!10 = !{!"short", !7, i64 0}  ; type node "short", parent !7 (a sibling of "int" under "omnipotent char")