Re-apply r110655 with fixes. The epilogue must restore sp from fp if the function's stack frame contains a variable-sized object.

Also add a test case that checks the added benefit of this patch: the unnecessary restore of sp from fp is now optimized away for some non-leaf functions.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@110707 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/Thumb/large-stack.ll b/test/CodeGen/Thumb/large-stack.ll
index 02de36a..b05e6bf 100644
--- a/test/CodeGen/Thumb/large-stack.ll
+++ b/test/CodeGen/Thumb/large-stack.ll
@@ -1,20 +1,35 @@
-; RUN: llc < %s -march=thumb | grep {ldr.*LCP} | count 5
+; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s
 
 define void @test1() {
+; CHECK: test1:
+; CHECK: sub sp, #256
+; CHECK: add sp, #256
     %tmp = alloca [ 64 x i32 ] , align 4
     ret void
 }
 
 define void @test2() {
+; CHECK: test2:
+; CHECK: ldr r0, LCPI
+; CHECK: add sp, r0
+; CHECK: mov sp, r7
+; CHECK: sub sp, #4
     %tmp = alloca [ 4168 x i8 ] , align 4
     ret void
 }
 
 define i32 @test3() {
-	%retval = alloca i32, align 4
-	%tmp = alloca i32, align 4
-	%a = alloca [805306369 x i8], align 16
-	store i32 0, i32* %tmp
-	%tmp1 = load i32* %tmp
-        ret i32 %tmp1
+; CHECK: test3:
+; CHECK: ldr r1, LCPI
+; CHECK: add sp, r1
+; CHECK: ldr r1, LCPI
+; CHECK: add r1, sp
+; CHECK: mov sp, r7
+; CHECK: sub sp, #4
+    %retval = alloca i32, align 4
+    %tmp = alloca i32, align 4
+    %a = alloca [805306369 x i8], align 16
+    store i32 0, i32* %tmp
+    %tmp1 = load i32* %tmp
+    ret i32 %tmp1
 }
diff --git a/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll b/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
new file mode 100644
index 0000000..abcf13a
--- /dev/null
+++ b/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -O3 | FileCheck %s
+
+@.str = private constant [4 x i8] c"%d\0A\00", align 4 ; <[4 x i8]*> [#uses=1]
+
+define internal fastcc i32 @Callee(i32 %i) nounwind {
+entry:
+; CHECK: Callee:
+  %0 = icmp eq i32 %i, 0                          ; <i1> [#uses=1]
+  br i1 %0, label %bb2, label %bb
+
+bb:                                               ; preds = %entry
+  %1 = alloca [1000 x i8], align 4                ; <[1000 x i8]*> [#uses=1]
+  %.sub = getelementptr inbounds [1000 x i8]* %1, i32 0, i32 0 ; <i8*> [#uses=2]
+  %2 = call i32 (i8*, i32, i32, i8*, ...)* @__sprintf_chk(i8* %.sub, i32 0, i32 1000, i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %i) nounwind ; <i32> [#uses=0]
+  %3 = load i8* %.sub, align 4                    ; <i8> [#uses=1]
+  %4 = sext i8 %3 to i32                          ; <i32> [#uses=1]
+  ret i32 %4
+
+bb2:                                              ; preds = %entry
+; Must restore sp from fp here: the alloca in %bb is outside the entry block, so the frame has a variable-sized object.
+; CHECK: mov sp, r7
+; CHECK: sub sp, #8
+; CHECK: pop
+  ret i32 0
+}
+
+declare i32 @__sprintf_chk(i8*, i32, i32, i8*, ...) nounwind
+
+define i32 @main() nounwind {
+; CHECK: main:
+bb.nph:
+  br label %bb
+
+bb:                                               ; preds = %bb, %bb.nph
+  %0 = phi i32 [ 0, %bb.nph ], [ %3, %bb ]        ; <i32> [#uses=2]
+  %j.01 = phi i32 [ 0, %bb.nph ], [ %2, %bb ]     ; <i32> [#uses=1]
+  %1 = tail call fastcc i32 @Callee(i32 %0) nounwind ; <i32> [#uses=1]
+  %2 = add nsw i32 %1, %j.01                      ; <i32> [#uses=2]
+  %3 = add nsw i32 %0, 1                          ; <i32> [#uses=2]
+  %exitcond = icmp eq i32 %3, 10000               ; <i1> [#uses=1]
+  br i1 %exitcond, label %bb2, label %bb
+
+bb2:                                              ; preds = %bb
+; No need to restore sp from fp here.
+; CHECK: printf
+; CHECK-NOT: mov sp, r7
+; CHECK-NOT: sub sp, #12
+; CHECK: pop
+  %4 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %2) nounwind ; <i32> [#uses=0]
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind