Rework loop predication pass

We've found a serious issue with the current implementation of loop predication.
The current implementation relies on SCEV and this turned out to be problematic.
To fix the problem we had to rework the pass substantially. We have had the
reworked implementation in our downstream tree for a while. This is the initial
patch of the series of changes to upstream the new implementation.

For now the transformation is limited to the following case:
  * The loop has a single latch with either ult or slt icmp condition.
  * The step of the IV used in the latch condition is 1.
  * The IV of the latch condition is the same as the post increment IV of the guard condition.
  * The guard condition is ult.

See the review or the LoopPredication.cpp header for the details about the
problem and the new implementation.

Reviewed By: sanjoy, mkazantsev

Differential Revision: https://reviews.llvm.org/D37569

llvm-svn: 313981
diff --git a/llvm/test/Transforms/LoopPredication/nested.ll b/llvm/test/Transforms/LoopPredication/nested.ll
index 6b40cde..796839f 100644
--- a/llvm/test/Transforms/LoopPredication/nested.ll
+++ b/llvm/test/Transforms/LoopPredication/nested.ll
@@ -10,8 +10,6 @@
   br i1 %tmp5, label %exit, label %outer.loop.preheader
 
 outer.loop.preheader:
-; CHECK: outer.loop.preheader:
-; CHECK: [[iteration_count:[^ ]+]] = add i32 %l, -1
   br label %outer.loop
 
 outer.loop:
@@ -22,7 +20,10 @@
   
 inner.loop.preheader:
 ; CHECK: inner.loop.preheader:
-; CHECK: [[wide_cond:[^ ]+]] = icmp slt i32 [[iteration_count]], %length
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp sle i32 %l, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
+; CHECK-NEXT: br label %inner.loop
   br label %inner.loop
 
 inner.loop:
@@ -31,7 +32,7 @@
   %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ]
   %j = phi i32 [ %j.next, %inner.loop ], [ 0, %inner.loop.preheader ]
 
-  %within.bounds = icmp slt i32 %j, %length
+  %within.bounds = icmp ult i32 %j, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   
   %j.i64 = zext i32 %j to i64
@@ -62,8 +63,10 @@
 
 outer.loop.preheader:
 ; CHECK: outer.loop.preheader:
-; CHECK: [[iteration_count:[^ ]+]] = add i32 %n, -1
-; CHECK: [[wide_cond:[^ ]+]] = icmp slt i32 [[iteration_count]], %length
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp sle i32 %n, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
+; CHECK-NEXT: br label %outer.loop
   br label %outer.loop
 
 outer.loop:
@@ -82,7 +85,7 @@
   %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ]
   %j = phi i32 [ %j.next, %inner.loop ], [ 0, %inner.loop.preheader ]
 
-  %within.bounds = icmp slt i32 %i, %length
+  %within.bounds = icmp ult i32 %i, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   
   %i.i64 = zext i32 %i to i64
@@ -112,14 +115,15 @@
   br i1 %tmp5, label %exit, label %outer.loop.preheader
 
 outer.loop.preheader:
+; CHECK: outer.loop.preheader:
+; CHECK-NEXT: [[first_iteration_check_outer:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check_outer:[^ ]+]] = icmp sle i32 %n, %length
+; CHECK-NEXT: [[wide_cond_outer:[^ ]+]] = and i1 [[first_iteration_check_outer]], [[limit_check_outer]]
+; CHECK-NEXT: br label %outer.loop
   br label %outer.loop
 
 outer.loop:
 ; CHECK: outer.loop:
-; CHECK: [[i_1:[^ ]+]] = add i32 %i, 1
-; CHECK-NEXT: [[l_sgt_i_1:[^ ]+]] = icmp sgt i32 %l, [[i_1]]
-; CHECK-NEXT: [[smax:[^ ]+]] = select i1 [[l_sgt_i_1]], i32 %l, i32 [[i_1]]
-; CHECK-NEXT: [[max_j:[^ ]+]] = add i32 [[smax]], -1
   %outer.loop.acc = phi i32 [ %outer.loop.acc.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ]
   %i = phi i32 [ %i.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ]
   %tmp6 = icmp sle i32 %l, 0
@@ -127,16 +131,69 @@
   
 inner.loop.preheader:
 ; CHECK: inner.loop.preheader:
-; CHECK: [[wide_cond:[^ ]+]] = icmp slt i32 [[max_j]], %length
+; CHECK: [[limit_check_inner:[^ ]+]] = icmp sle i32 %l, %length
+; CHECK: br label %inner.loop
   br label %inner.loop
 
 inner.loop:
 ; CHECK: inner.loop:
+; CHECK: [[wide_cond:[^ ]+]] = and i1 [[limit_check_inner]], [[wide_cond_outer]]
 ; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
   %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ]
   %j = phi i32 [ %j.next, %inner.loop ], [ %i, %inner.loop.preheader ]
 
-  %within.bounds = icmp slt i32 %j, %length
+  %within.bounds = icmp ult i32 %j, %length
+  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
+  
+  %j.i64 = zext i32 %j to i64
+  %array.j.ptr = getelementptr inbounds i32, i32* %array, i64 %j.i64
+  %array.j = load i32, i32* %array.j.ptr, align 4
+  %inner.loop.acc.next = add i32 %inner.loop.acc, %array.j
+
+  %j.next = add nsw i32 %j, 1
+  %inner.continue = icmp slt i32 %j.next, %l
+  br i1 %inner.continue, label %inner.loop, label %outer.loop.inc
+
+outer.loop.inc:
+  %outer.loop.acc.next = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %outer.loop ]
+  %i.next = add nsw i32 %i, 1
+  %outer.continue = icmp slt i32 %i.next, %n
+  br i1 %outer.continue, label %outer.loop, label %exit
+
+exit:
+  %result = phi i32 [ 0, %entry ], [ %outer.loop.acc.next, %outer.loop.inc ]
+  ret i32 %result
+}
+
+define i32 @cant_expand_guard_check_start(i32* %array, i32 %length, i32 %n, i32 %l, i32 %maybezero) {
+; CHECK-LABEL: @cant_expand_guard_check_start
+entry:
+  %tmp5 = icmp sle i32 %n, 0
+  br i1 %tmp5, label %exit, label %outer.loop.preheader
+
+outer.loop.preheader:
+  br label %outer.loop
+
+outer.loop:
+  %outer.loop.acc = phi i32 [ %outer.loop.acc.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ]
+  %i = phi i32 [ %i.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ]
+  %tmp6 = icmp sle i32 %l, 0
+  %div = udiv i32 %i, %maybezero
+  br i1 %tmp6, label %outer.loop.inc, label %inner.loop.preheader
+  
+inner.loop.preheader:
+; CHECK: inner.loop.preheader:
+; CHECK: br label %inner.loop
+  br label %inner.loop
+
+inner.loop:
+; CHECK: inner.loop:
+; CHECK: %within.bounds = icmp ult i32 %j, %length
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
+  %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ]
+  %j = phi i32 [ %j.next, %inner.loop ], [ %div, %inner.loop.preheader ]
+
+  %within.bounds = icmp ult i32 %j, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   
   %j.i64 = zext i32 %j to i64