ARM: simplify and extend byval handling

The main issue being fixed here is that APCS targets handling a "byval align N"
parameter with N > 4 were miscounting what objects were where on the stack,
leading to FrameLowering setting the frame pointer incorrectly and clobbering
the stack.

But byval handling had grown over many years, and had multiple layers of cruft
trying to compensate for each other and calculate padding correctly. This only
really needs to be done once, in the HandleByVal function. Elsewhere should
just do what it's told by that call.

I also stripped out unnecessary APCS/AAPCS distinctions (now that Clang emits
byvals with the correct C ABI alignment), which simplified HandleByVal.

rdar://20095672

llvm-svn: 231959
diff --git a/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
index b5bdc1b..b64b1bf 100644
--- a/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
+++ b/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -10,7 +10,9 @@
 ; CHECK-LABEL: test_byval_8_bytes_alignment:
 define void @test_byval_8_bytes_alignment(i32 %i, ...) {
 entry:
-; CHECK: stm     r0, {r1, r2, r3}
+; CHECK: sub       sp, sp, #12
+; CHECK: sub       sp, sp, #4
+; CHECK: stmib     sp, {r1, r2, r3}
   %g = alloca i8*
   %g1 = bitcast i8** %g to i8*
   call void @llvm.va_start(i8* %g1)
diff --git a/llvm/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll b/llvm/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll
index 0028eec..ff3b7e1 100644
--- a/llvm/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll
+++ b/llvm/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll
@@ -10,8 +10,9 @@
 entry:
 
 ; Here we need to only check proper start address of restored %s argument.
-; CHECK:      sub     sp, sp, #16
+; CHECK:      sub     sp, sp, #12
 ; CHECK:      push    {r11, lr}
+; CHECK:      sub     sp, sp, #4
 ; CHECK:      add     r0, sp, #12
 ; CHECK:      stm     r0, {r1, r2, r3}
 ; CHECK:      add     r0, sp, #12
diff --git a/llvm/test/CodeGen/ARM/2013-04-05-Small-ByVal-Structs-PR15293.ll b/llvm/test/CodeGen/ARM/2013-04-05-Small-ByVal-Structs-PR15293.ll
index c5eba7d..c38dd16 100644
--- a/llvm/test/CodeGen/ARM/2013-04-05-Small-ByVal-Structs-PR15293.ll
+++ b/llvm/test/CodeGen/ARM/2013-04-05-Small-ByVal-Structs-PR15293.ll
@@ -2,26 +2,26 @@
 ;RUN: llc -mtriple=arm-linux-gnueabihf < %s | FileCheck %s
 
 ;CHECK-LABEL: foo:
-;CHECK: 	sub	sp, sp, #8
-;CHECK: 	push	{r11, lr}
-;CHECK: 	str	r0, [sp, #12]
-;CHECK: 	add	r0, sp, #12
-;CHECK: 	bl	fooUseParam
-;CHECK: 	pop	{r11, lr}
-;CHECK: 	add	sp, sp, #8
-;CHECK: 	mov	pc, lr
-
-;CHECK-LABEL: foo2:
-;CHECK: 	sub	sp, sp, #8
+;CHECK: 	sub	sp, sp, #16
 ;CHECK: 	push	{r11, lr}
 ;CHECK: 	str	r0, [sp, #8]
 ;CHECK: 	add	r0, sp, #8
-;CHECK: 	str	r2, [sp, #12]
-;CHECK: 	bl	fooUseParam
-;CHECK: 	add	r0, sp, #12
 ;CHECK: 	bl	fooUseParam
 ;CHECK: 	pop	{r11, lr}
-;CHECK: 	add	sp, sp, #8
+;CHECK: 	add	sp, sp, #16
+;CHECK: 	mov	pc, lr
+
+;CHECK-LABEL: foo2:
+;CHECK: 	sub	sp, sp, #16
+;CHECK: 	push	{r11, lr}
+;CHECK: 	str	r0, [sp, #8]
+;CHECK: 	add	r0, sp, #8
+;CHECK: 	str	r2, [sp, #16]
+;CHECK: 	bl	fooUseParam
+;CHECK: 	add	r0, sp, #16
+;CHECK: 	bl	fooUseParam
+;CHECK: 	pop	{r11, lr}
+;CHECK: 	add	sp, sp, #16
 ;CHECK: 	mov	pc, lr
 
 ;CHECK-LABEL: doFoo:
diff --git a/llvm/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll b/llvm/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
index e79a3ba..68b918d 100644
--- a/llvm/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
+++ b/llvm/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
@@ -21,11 +21,12 @@
 		 i32 %p2,          ; --> R3,     NSAA=SP+8 
                  i32 %p3) #0 {     ; --> SP+4,   NSAA=SP+12
 entry:
-  ;CHECK: sub sp, #8
+  ;CHECK: sub sp, #12
   ;CHECK: push.w {r11, lr}
-  ;CHECK: add r0, sp, #8
-  ;CHECK: str r2, [sp, #12]
-  ;CHECK: str r1, [sp, #8]
+  ;CHECK: sub sp, #4
+  ;CHECK: add r0, sp, #12
+  ;CHECK: str r2, [sp, #16]
+  ;CHECK: str r1, [sp, #12]
   ;CHECK: bl  fooUseStruct
   call void @fooUseStruct(%st_t* %p1)
   ret void
diff --git a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
index 7bf03a1..3c20c6b 100644
--- a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
+++ b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
@@ -4,7 +4,7 @@
 %struct.S227 = type { [49 x i32], i32 }
 
 define void @check227(
-                      i32 %b,                              
+                      i32 %b,
                       %struct.S227* byval nocapture %arg0,
                       %struct.S227* %arg1) {
 ; b --> R0
@@ -13,14 +13,16 @@
 
 entry:
 
-;CHECK:  sub   sp, sp, #16
+;CHECK:  sub   sp, sp, #12
 ;CHECK:  push  {r11, lr}
+;CHECK:  sub   sp, sp, #4
 ;CHECK:  add   r0, sp, #12
 ;CHECK:  stm   r0, {r1, r2, r3}
 ;CHECK:  ldr   r0, [sp, #212]
 ;CHECK:  bl    useInt
+;CHECK:  add   sp, sp, #4
 ;CHECK:  pop   {r11, lr}
-;CHECK:  add   sp, sp, #16
+;CHECK:  add   sp, sp, #12
 
   %0 = ptrtoint %struct.S227* %arg1 to i32
   tail call void @useInt(i32 %0)
diff --git a/llvm/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll b/llvm/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll
index 33bfa2f..5b2fc57 100644
--- a/llvm/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll
+++ b/llvm/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll
@@ -13,15 +13,16 @@
 ; c -> sp+0..sp+7
 define void @foo1(i32 %a, %struct12bytes* byval %b, i64 %c) {
 ; CHECK-LABEL: foo1
-; CHECK: sub  sp, sp, #16
+; CHECK: sub  sp, sp, #12
 ; CHECK: push  {r11, lr}
+; CHECK: sub sp, sp, #4
 ; CHECK: add  [[SCRATCH:r[0-9]+]], sp, #12
 ; CHECK: stm  [[SCRATCH]], {r1, r2, r3}
 ; CHECK: ldr  r0, [sp, #24]
 ; CHECK: ldr  r1, [sp, #28]
 ; CHECK: bl  useLong
 ; CHECK: pop  {r11, lr}
-; CHECK: add  sp, sp, #16
+; CHECK: add  sp, sp, #12
 
   call void @useLong(i64 %c)
   ret void
diff --git a/llvm/test/CodeGen/ARM/byval-align.ll b/llvm/test/CodeGen/ARM/byval-align.ll
new file mode 100644
index 0000000..a26b5a7
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/byval-align.ll
@@ -0,0 +1,76 @@
+; RUN: llc -mtriple=thumbv7-apple-ios8.0 %s -o - | FileCheck %s
+
+; This checks that alignments greater than 4 are respected by APCS
+; targets. Mostly here to make sure *some* correct code is created after some
+; simplifying refactoring; at the time of writing there were no actual APCS
+; users of byval alignments > 4, so no real calls for ABI stability.
+
+; "byval align 16" can't fit in any regs with an i8* taking up r0.
+define i32 @test_align16(i8*, [4 x i32]* byval align 16 %b) {
+; CHECK-LABEL: test_align16:
+; CHECK-NOT: sub sp
+; CHECK: push {r4, r7, lr}
+; CHECK: add r7, sp, #4
+
+; CHECK: ldr r0, [r7, #8]
+
+  call void @bar()
+  %valptr = getelementptr [4 x i32], [4 x i32]* %b, i32 0, i32 0
+  %val = load i32, i32* %valptr
+  ret i32 %val
+}
+
+; byval align 8 can, but we used to incorrectly set r7 here (miscalculating the
+; space taken up by arg regs).
+define i32 @test_align8(i8*, [4 x i32]* byval align 8 %b) {
+; CHECK-LABEL: test_align8:
+; CHECK: sub sp, #8
+; CHECK: push {r4, r7, lr}
+; CHECK: add r7, sp, #4
+
+; CHECK-DAG: str r2, [r7, #8]
+; CHECK-DAG: str r3, [r7, #12]
+
+; CHECK: ldr r0, [r7, #8]
+
+  call void @bar()
+  %valptr = getelementptr [4 x i32], [4 x i32]* %b, i32 0, i32 0
+  %val = load i32, i32* %valptr
+  ret i32 %val
+}
+
+; "byval align 32" can't fit in regs no matter what: it would be misaligned
+; unless the incoming stack was deliberately misaligned.
+define i32 @test_align32(i8*, [4 x i32]* byval align 32 %b) {
+; CHECK-LABEL: test_align32:
+; CHECK-NOT: sub sp
+; CHECK: push {r4, r7, lr}
+; CHECK: add r7, sp, #4
+
+; CHECK: ldr r0, [r7, #8]
+
+  call void @bar()
+  %valptr = getelementptr [4 x i32], [4 x i32]* %b, i32 0, i32 0
+  %val = load i32, i32* %valptr
+  ret i32 %val
+}
+
+; When passing an object "byval align N", the stack must be at least N-aligned.
+define void @test_call_align16() {
+; CHECK-LABEL: test_call_align16:
+; CHECK: push {r4, r7, lr}
+; CHECK: add r7, sp, #4
+
+; CHECK: mov [[TMP:r[0-9]+]], sp
+; CHECK: bfc [[TMP]], #0, #4
+; CHECK: mov sp, [[TMP]]
+
+; While we're here, make sure the caller also puts it at sp
+  ; CHECK: mov r[[BASE:[0-9]+]], sp
+  ; CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
+  call i32 @test_align16(i8* null, [4 x i32]* byval align 16 @var)
+  ret void
+}
+
+@var = global [4 x i32] zeroinitializer
+declare void @bar()
diff --git a/llvm/test/CodeGen/ARM/debug-frame-vararg.ll b/llvm/test/CodeGen/ARM/debug-frame-vararg.ll
index 2608623..4ff3fa0 100644
--- a/llvm/test/CodeGen/ARM/debug-frame-vararg.ll
+++ b/llvm/test/CodeGen/ARM/debug-frame-vararg.ll
@@ -62,51 +62,51 @@
 
 ; CHECK-FP-LABEL: sum
 ; CHECK-FP: .cfi_startproc
-; CHECK-FP: sub    sp, sp, #16
-; CHECK-FP: .cfi_def_cfa_offset 16
+; CHECK-FP: sub    sp, sp, #12
+; CHECK-FP: .cfi_def_cfa_offset 12
 ; CHECK-FP: push   {r4, lr}
+; CHECK-FP: .cfi_def_cfa_offset 20
+; CHECK-FP: .cfi_offset lr, -16
+; CHECK-FP: .cfi_offset r4, -20
+; CHECK-FP: sub    sp, sp, #4
 ; CHECK-FP: .cfi_def_cfa_offset 24
-; CHECK-FP: .cfi_offset lr, -20
-; CHECK-FP: .cfi_offset r4, -24
-; CHECK-FP: sub    sp, sp, #8
-; CHECK-FP: .cfi_def_cfa_offset 32
 
 ; CHECK-FP-ELIM-LABEL: sum
 ; CHECK-FP-ELIM: .cfi_startproc
-; CHECK-FP-ELIM: sub    sp, sp, #16
-; CHECK-FP-ELIM: .cfi_def_cfa_offset 16
+; CHECK-FP-ELIM: sub    sp, sp, #12
+; CHECK-FP-ELIM: .cfi_def_cfa_offset 12
 ; CHECK-FP-ELIM: push   {r4, r10, r11, lr}
-; CHECK-FP-ELIM: .cfi_def_cfa_offset 32
-; CHECK-FP-ELIM: .cfi_offset lr, -20
-; CHECK-FP-ELIM: .cfi_offset r11, -24
-; CHECK-FP-ELIM: .cfi_offset r10, -28
-; CHECK-FP-ELIM: .cfi_offset r4, -32
+; CHECK-FP-ELIM: .cfi_def_cfa_offset 28
+; CHECK-FP-ELIM: .cfi_offset lr, -16
+; CHECK-FP-ELIM: .cfi_offset r11, -20
+; CHECK-FP-ELIM: .cfi_offset r10, -24
+; CHECK-FP-ELIM: .cfi_offset r4, -28
 ; CHECK-FP-ELIM: add    r11, sp, #8
-; CHECK-FP-ELIM: .cfi_def_cfa r11, 24
+; CHECK-FP-ELIM: .cfi_def_cfa r11, 20
 
 ; CHECK-THUMB-FP-LABEL: sum
 ; CHECK-THUMB-FP: .cfi_startproc
-; CHECK-THUMB-FP: sub    sp, #16
-; CHECK-THUMB-FP: .cfi_def_cfa_offset 16
+; CHECK-THUMB-FP: sub    sp, #12
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 12
 ; CHECK-THUMB-FP: push   {r4, lr}
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 20
+; CHECK-THUMB-FP: .cfi_offset lr, -16
+; CHECK-THUMB-FP: .cfi_offset r4, -20
+; CHECK-THUMB-FP: sub    sp, #4
 ; CHECK-THUMB-FP: .cfi_def_cfa_offset 24
-; CHECK-THUMB-FP: .cfi_offset lr, -20
-; CHECK-THUMB-FP: .cfi_offset r4, -24
-; CHECK-THUMB-FP: sub    sp, #8
-; CHECK-THUMB-FP: .cfi_def_cfa_offset 32
 
 ; CHECK-THUMB-FP-ELIM-LABEL: sum
 ; CHECK-THUMB-FP-ELIM: .cfi_startproc
-; CHECK-THUMB-FP-ELIM: sub    sp, #16
-; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 16
+; CHECK-THUMB-FP-ELIM: sub    sp, #12
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 12
 ; CHECK-THUMB-FP-ELIM: push   {r4, r6, r7, lr}
-; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 32
-; CHECK-THUMB-FP-ELIM: .cfi_offset lr, -20
-; CHECK-THUMB-FP-ELIM: .cfi_offset r7, -24
-; CHECK-THUMB-FP-ELIM: .cfi_offset r6, -28
-; CHECK-THUMB-FP-ELIM: .cfi_offset r4, -32
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 28
+; CHECK-THUMB-FP-ELIM: .cfi_offset lr, -16
+; CHECK-THUMB-FP-ELIM: .cfi_offset r7, -20
+; CHECK-THUMB-FP-ELIM: .cfi_offset r6, -24
+; CHECK-THUMB-FP-ELIM: .cfi_offset r4, -28
 ; CHECK-THUMB-FP-ELIM: add    r7, sp, #8
-; CHECK-THUMB-FP-ELIM: .cfi_def_cfa r7, 24
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa r7, 20
 
 define i32 @sum(i32 %count, ...) {
 entry:
diff --git a/llvm/test/CodeGen/ARM/ssp-data-layout.ll b/llvm/test/CodeGen/ARM/ssp-data-layout.ll
index 516cc2b..d08e7de 100644
--- a/llvm/test/CodeGen/ARM/ssp-data-layout.ll
+++ b/llvm/test/CodeGen/ARM/ssp-data-layout.ll
@@ -161,7 +161,7 @@
   %coerce.dive26 = getelementptr %struct.struct_small_nonchar, %struct.struct_small_nonchar* %d, i32 0, i32 0
   %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
   %8 = load i32, i32* %7, align 1
-  call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+  call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 4 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
   ret void
 }
 
@@ -308,7 +308,7 @@
   %coerce.dive26 = getelementptr %struct.struct_small_nonchar, %struct.struct_small_nonchar* %d, i32 0, i32 0
   %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
   %8 = load i32, i32* %7, align 1
-  call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+  call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 4 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
   ret void
 }
 
@@ -443,7 +443,7 @@
   %coerce.dive26 = getelementptr %struct.struct_small_nonchar, %struct.struct_small_nonchar* %d, i32 0, i32 0
   %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
   %8 = load i32, i32* %7, align 1
-  call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+  call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 4 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
   ret void
 }
 
@@ -482,7 +482,7 @@
   %coerce.dive5 = getelementptr %struct.struct_small_nonchar, %struct.struct_small_nonchar* %d2, i32 0, i32 0
   %5 = bitcast [2 x i16]* %coerce.dive5 to i32*
   %6 = load i32, i32* %5, align 1
-  call void @takes_all(i64 %2, i16 %4, %struct.struct_large_nonchar* byval align 8 %d1, i32 %6, i8* null, i8* null, i32* null, i16* null, i32* null, i32 0, i32 0, i32 0)
+  call void @takes_all(i64 %2, i16 %4, %struct.struct_large_nonchar* byval align 4 %d1, i32 %6, i8* null, i8* null, i32* null, i16* null, i32* null, i32 0, i32 0, i32 0)
   ret void
 }
 
diff --git a/llvm/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll b/llvm/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
index 148a79d..4879d73 100644
--- a/llvm/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
+++ b/llvm/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
@@ -21,10 +21,10 @@
 ; CHECK-LABEL: varargs_func:
 ; Reserve space for the varargs save area.  This currently reserves
 ; more than enough (16 bytes rather than the 12 bytes needed).
-; CHECK: sub sp, sp, #16
+; CHECK: sub sp, sp, #12
 ; CHECK: push {r11, lr}
 ; Align the stack pointer to a multiple of 16.
-; CHECK: sub sp, sp, #8
+; CHECK: sub sp, sp, #12
 ; Calculate the address of the varargs save area and save varargs
 ; arguments into it.
 ; CHECK-NEXT: add r0, sp, #20
diff --git a/llvm/test/CodeGen/ARM/vargs_align.ll b/llvm/test/CodeGen/ARM/vargs_align.ll
index d19abd5..6dc7135 100644
--- a/llvm/test/CodeGen/ARM/vargs_align.ll
+++ b/llvm/test/CodeGen/ARM/vargs_align.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=EABI
+; RUN: llc < %s -mtriple=armv7-linux-gnueabihf | FileCheck %s -check-prefix=EABI
 ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnu | FileCheck %s -check-prefix=OABI
 
 define i32 @f(i32 %a, ...) {
@@ -11,13 +11,17 @@
 	%tmp1 = load i32, i32* %tmp		; <i32> [#uses=1]
 	store i32 %tmp1, i32* %retval
 	call void @llvm.va_start(i8* null)
+	call void asm sideeffect "", "~{d8}"()
 	br label %return
 
 return:		; preds = %entry
 	%retval2 = load i32, i32* %retval		; <i32> [#uses=1]
 	ret i32 %retval2
-; EABI: add sp, sp, #12
 ; EABI: add sp, sp, #16
+; EABI: vpop {d8}
+; EABI: add sp, sp, #4
+; EABI: add sp, sp, #12
+
 ; OABI: add sp, sp, #12
 ; OABI: add sp, sp, #12
 }