[AArch64] Combine callee-save and local stack SP adjustment instructions.

Summary:
If a function needs to allocate both callee-save stack memory and local
stack memory, we currently decrement/increment the SP in two steps:
first for the callee-save area, and then for the local stack area.  This
changes the code to allocate them both at once at the very beginning/end
of the function.  This has two benefits:

1) there is one fewer sub/add micro-op in the prologue/epilogue

2) the stack adjustment instructions act as a scheduling barrier, so
moving them to the very beginning/end of the function increases the
post-RA scheduler's ability to move instructions (that only depend on
argument registers) before any of the callee-save stores

This change can cause an increase in instructions if the original local
stack SP decrement could be folded into the first store to the stack.
This occurs when the first local stack store is to stack offset 0.  In
this case we are trading off one more sub instruction for one fewer sub
micro-op (along with benefit (2) above).

Reviewers: t.p.northover

Subscribers: aemerson, rengolin, mcrosier, llvm-commits

Differential Revision: http://reviews.llvm.org/D18619

llvm-svn: 268746
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index 73b6801..9429e87 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -98,8 +98,8 @@
 ; CHECK-LABEL: novla_nodynamicrealign_call
 ; CHECK: .cfi_startproc
 ;   Check that used callee-saved registers are saved
-; CHECK: stp	x19, x30, [sp, #-16]!
-; CHECK: sub	sp, sp, #16
+; CHECK: sub	sp, sp, #32
+; CHECK: stp	x19, x30, [sp, #16]
 ;   Check correctness of cfi pseudo-instructions
 ; CHECK: .cfi_def_cfa_offset 32
 ; CHECK: .cfi_offset w30, -8
@@ -110,17 +110,18 @@
 ;   Check correct access to local variable on the stack, through stack pointer
 ; CHECK: ldr	w[[ILOC:[0-9]+]], [sp, #12]
 ;   Check epilogue:
-; CHECK: ldp	x19, x30, [sp], #16
+; CHECK: ldp	x19, x30, [sp, #16]
 ; CHECK: ret
 ; CHECK: .cfi_endproc
 
 ; CHECK-MACHO-LABEL: _novla_nodynamicrealign_call:
 ; CHECK-MACHO: .cfi_startproc
 ;   Check that used callee-saved registers are saved
-; CHECK-MACHO: stp	x20, x19, [sp, #-32]!
+; CHECK-MACHO: sub	sp, sp, #48
+; CHECK-MACHO: stp	x20, x19, [sp, #16]
 ;   Check that the frame pointer is created:
-; CHECK-MACHO: stp	x29, x30, [sp, #16]
-; CHECK-MACHO: add	x29, sp, #16
+; CHECK-MACHO: stp	x29, x30, [sp, #32]
+; CHECK-MACHO: add	x29, sp, #32
 ;   Check correctness of cfi pseudo-instructions
 ; CHECK-MACHO: .cfi_def_cfa w29, 16
 ; CHECK-MACHO: .cfi_offset w30, -8
@@ -133,8 +134,8 @@
 ;   Check correct access to local variable on the stack, through stack pointer
 ; CHECK-MACHO: ldr	w[[ILOC:[0-9]+]], [sp, #12]
 ;   Check epilogue:
-; CHECK-MACHO: ldp	x29, x30, [sp, #16]
-; CHECK-MACHO: ldp	x20, x19, [sp], #32
+; CHECK-MACHO: ldp	x29, x30, [sp, #32]
+; CHECK-MACHO: ldp	x20, x19, [sp, #16]
 ; CHECK-MACHO: ret
 ; CHECK-MACHO: .cfi_endproc
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-aapcs-be.ll b/llvm/test/CodeGen/AArch64/arm64-aapcs-be.ll
index e77952e..74b6ae1 100644
--- a/llvm/test/CodeGen/AArch64/arm64-aapcs-be.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-aapcs-be.ll
@@ -32,7 +32,8 @@
 
 define void @test_block_addr_callee() {
 ; CHECK-LABEL: test_block_addr_callee:
-; CHECK: str {{[a-z0-9]+}}, [sp, #-16]!
+; CHECK: sub sp, sp, #32
+; CHECK: str {{[a-z0-9]+}}, [sp, #16]
 ; CHECK: bl test_block_addr
   %val = insertvalue [1 x float] undef, float 0.0, 0
   call float @test_block_addr([8 x float] undef, [1 x float] %val)
diff --git a/llvm/test/CodeGen/AArch64/arm64-abi.ll b/llvm/test/CodeGen/AArch64/arm64-abi.ll
index 36a6822..fb52b1d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-abi.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi.ll
@@ -130,7 +130,7 @@
 ; CHECK-LABEL: test3
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
 ; FAST-LABEL: test3
-; FAST: sub sp, sp, #32
+; FAST: sub sp, sp, #48
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
   %0 = load <2 x i32>, <2 x i32>* %in, align 8
diff --git a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
index a341cb6..cfe47dd 100644
--- a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -291,7 +291,7 @@
 ; Space for s2 is allocated at sp
 
 ; FAST-LABEL: caller42
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
 ; Space for s1 is allocated at fp-24 = sp+72
 ; Space for s2 is allocated at sp+48
 ; FAST: sub x[[A:[0-9]+]], x29, #24
@@ -317,8 +317,8 @@
 define i32 @caller42_stack() #3 {
 entry:
 ; CHECK-LABEL: caller42_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
 ; CHECK: stur {{x[0-9]+}}, [x29, #-16]
 ; CHECK: stur {{q[0-9]+}}, [x29, #-32]
 ; CHECK: str {{x[0-9]+}}, [sp, #48]
@@ -399,7 +399,7 @@
 ; Space for s2 is allocated at sp
 
 ; FAST-LABEL: caller43
-; FAST: mov x29, sp
+; FAST: add x29, sp, #64
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
 ; FAST: add x1, sp, #32
@@ -429,8 +429,8 @@
 define i32 @caller43_stack() #3 {
 entry:
 ; CHECK-LABEL: caller43_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
 ; CHECK: stur {{q[0-9]+}}, [x29, #-16]
 ; CHECK: stur {{q[0-9]+}}, [x29, #-32]
 ; CHECK: str {{q[0-9]+}}, [sp, #48]
@@ -446,7 +446,7 @@
 ; CHECK: str w[[C]], [sp]
 
 ; FAST-LABEL: caller43_stack
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
 ; Space for s1 is allocated at fp-32 = sp+64
 ; Space for s2 is allocated at sp+32
 ; FAST: sub x[[A:[0-9]+]], x29, #32
@@ -508,7 +508,7 @@
 ; "i64 %0" should be in register x7.
 ; "i32 8" should be on stack at [sp].
 ; CHECK: ldr x7, [{{x[0-9]+}}]
-; CHECK: str {{w[0-9]+}}, [sp, #-16]!
+; CHECK: str {{w[0-9]+}}, [sp]
 ; FAST-LABEL: i64_split
 ; FAST: ldr x7, [{{x[0-9]+}}]
 ; FAST: mov x[[R0:[0-9]+]], sp
diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
index 1fd46f3..bdc24ae 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -14,7 +14,7 @@
 define void @main() nounwind {
 entry:
 ; CHECK: main
-; CHECK: mov x29, sp
+; CHECK: add x29, sp, #16
 ; CHECK: mov [[REG:x[0-9]+]], sp
 ; CHECK-NEXT: add x0, [[REG]], #8
   %E = alloca %struct.S2Ty, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-hello.ll b/llvm/test/CodeGen/AArch64/arm64-hello.ll
index 202b69c..43fb2e0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-hello.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-hello.ll
@@ -2,26 +2,26 @@
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX
 
 ; CHECK-LABEL: main:
-; CHECK:	stp	x29, x30, [sp, #-16]!
-; CHECK-NEXT:	mov	x29, sp
-; CHECK-NEXT:	sub	sp, sp, #16
+; CHECK:	sub	sp, sp, #32
+; CHECK-NEXT:	stp	x29, x30, [sp, #16]
+; CHECK-NEXT:	add	x29, sp, #16
 ; CHECK-NEXT:	stur	wzr, [x29, #-4]
 ; CHECK:	adrp	x0, L_.str@PAGE
 ; CHECK:	add	x0, x0, L_.str@PAGEOFF
 ; CHECK-NEXT:	bl	_puts
-; CHECK-NEXT:	add	sp, sp, #16
-; CHECK-NEXT:	ldp	x29, x30, [sp], #16
+; CHECK-NEXT:	ldp	x29, x30, [sp, #16]
+; CHECK-NEXT:	add	sp, sp, #32
 ; CHECK-NEXT:	ret
 
 ; CHECK-LINUX-LABEL: main:
-; CHECK-LINUX:	str	x30, [sp, #-16]!
-; CHECK-LINUX-NEXT:	sub	sp, sp, #16
+; CHECK-LINUX:	sub	sp, sp, #32
+; CHECK-LINUX-NEXT:	str	x30, [sp, #16]
 ; CHECK-LINUX-NEXT:	str	wzr, [sp, #12]
 ; CHECK-LINUX:	adrp	x0, .L.str
 ; CHECK-LINUX:	add	x0, x0, :lo12:.L.str
 ; CHECK-LINUX-NEXT:	bl	puts
-; CHECK-LINUX-NEXT:	add	sp, sp, #16
-; CHECK-LINUX-NEXT:	ldr	x30, [sp], #16
+; CHECK-LINUX-NEXT:	ldr	x30, [sp, #16]
+; CHECK-LINUX-NEXT:	add	sp, sp, #32
 ; CHECK-LINUX-NEXT:	ret
 
 @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
diff --git a/llvm/test/CodeGen/AArch64/arm64-join-reserved.ll b/llvm/test/CodeGen/AArch64/arm64-join-reserved.ll
index c65cf95..dee0344 100644
--- a/llvm/test/CodeGen/AArch64/arm64-join-reserved.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -5,7 +5,7 @@
 ; A move isn't necessary.
 ; <rdar://problem/11492712>
 ; CHECK-LABEL: g:
-; CHECK: str xzr, [sp, #-16]!
+; CHECK: str xzr, [sp]
 ; CHECK: bl
 ; CHECK: ret
 define void @g() nounwind ssp {
diff --git a/llvm/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/llvm/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
index c2006cc..b8236c5 100644
--- a/llvm/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -7,7 +7,7 @@
 entry:
 ; CHECK-LABEL: jscall_patchpoint_codegen:
 ; CHECK:       Ltmp
-; CHECK:       str x{{.+}}, [sp, #-16]!
+; CHECK:       str x{{.+}}, [sp]
 ; CHECK-NEXT:  mov  x0, x{{.+}}
 ; CHECK:       Ltmp
 ; CHECK-NEXT:  movz  x16, #0xffff, lsl #32
@@ -16,7 +16,7 @@
 ; CHECK-NEXT:  blr x16
 ; FAST-LABEL:  jscall_patchpoint_codegen:
 ; FAST:        Ltmp
-; FAST:        str x{{.+}}, [sp, #-16]!
+; FAST:        str x{{.+}}, [sp]
 ; FAST:        Ltmp
 ; FAST-NEXT:   movz  x16, #0xffff, lsl #32
 ; FAST-NEXT:   movk  x16, #0xdead, lsl #16
@@ -50,7 +50,7 @@
 ; FAST:        orr [[REG1:x[0-9]+]], xzr, #0x2
 ; FAST-NEXT:   orr [[REG2:w[0-9]+]], wzr, #0x4
 ; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
-; FAST-NEXT:   str [[REG1]], [sp, #-32]!
+; FAST-NEXT:   str [[REG1]], [sp]
 ; FAST-NEXT:   str [[REG2]], [sp, #16]
 ; FAST-NEXT:   str [[REG3]], [sp, #24]
 ; FAST:        Ltmp
@@ -90,7 +90,7 @@
 ; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
 ; FAST-NEXT:   orr [[REG4:w[0-9]+]], wzr, #0x8
 ; FAST-NEXT:   movz [[REG5:x[0-9]+]], #0xa
-; FAST-NEXT:   str [[REG1]], [sp, #-64]!
+; FAST-NEXT:   str [[REG1]], [sp]
 ; FAST-NEXT:   str [[REG2]], [sp, #16]
 ; FAST-NEXT:   str [[REG3]], [sp, #24]
 ; FAST-NEXT:   str [[REG4]], [sp, #36]
diff --git a/llvm/test/CodeGen/AArch64/arm64-patchpoint.ll b/llvm/test/CodeGen/AArch64/arm64-patchpoint.ll
index 4486ea5..6412427 100644
--- a/llvm/test/CodeGen/AArch64/arm64-patchpoint.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -26,10 +26,11 @@
 ; as a leaf function.
 ;
 ; CHECK-LABEL: caller_meta_leaf
-; CHECK:       mov x29, sp
-; CHECK-NEXT:  sub sp, sp, #32
+; CHECK:       sub sp, sp, #48
+; CHECK-NEXT:  stp x29, x30, [sp, #32]
+; CHECK-NEXT:  add x29, sp, #32
 ; CHECK:       Ltmp
-; CHECK:       add sp, sp, #32
+; CHECK:       add sp, sp, #48
 ; CHECK:       ret
 
 define void @caller_meta_leaf() {
diff --git a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
index 2811f1b..010eb64 100644
--- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -13,9 +13,9 @@
 ; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
-; CHECK: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[SAVE_SP]], sp
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16]
+; CHECK-NEXT: add [[SAVE_SP]], sp, #16
 ;
 ; Compare the arguments and jump to exit.
 ; After the prologue is set.
@@ -33,8 +33,8 @@
 ; Without shrink-wrapping, epilogue is in the exit block.
 ; DISABLE: [[EXIT_LABEL]]:
 ; Epilogue code.
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp], #16
+; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
 ;
 ; With shrink-wrapping, exit block is a simple return.
 ; ENABLE: [[EXIT_LABEL]]:
@@ -454,9 +454,9 @@
 ; ENABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
-; CHECK: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[NEW_SP:x[0-9]+]], sp
-; CHECK-NEXT: sub sp, sp, #48
+; CHECK: sub sp, sp, #64
+; CHECK-NEXT: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #48]
+; CHECK-NEXT: add [[NEW_SP:x[0-9]+]], sp, #48
 ;
 ; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
 ; Setup of the varags.
@@ -473,8 +473,8 @@
 ; DISABLE: [[IFEND_LABEL]]: ; %if.end
 ;
 ; Epilogue code.
-; CHECK: add sp, sp, #48
-; CHECK-NEXT: ldp [[CSR1]], [[CSR2]], [sp], #16
+; CHECK: ldp [[CSR1]], [[CSR2]], [sp, #48]
+; CHECK-NEXT: add sp, sp, #64
 ; CHECK-NEXT: ret
 ;
 ; ENABLE: [[ELSE_LABEL]]: ; %if.else
diff --git a/llvm/test/CodeGen/AArch64/fastcc.ll b/llvm/test/CodeGen/AArch64/fastcc.ll
index c23d91e..fcc8522 100644
--- a/llvm/test/CodeGen/AArch64/fastcc.ll
+++ b/llvm/test/CodeGen/AArch64/fastcc.ll
@@ -7,13 +7,15 @@
 
 define fastcc void @func_stack0() {
 ; CHECK-LABEL: func_stack0:
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
 
 ; CHECK-TAIL-LABEL: func_stack0:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL-NEXT: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL-NEXT: stp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -42,27 +44,29 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK: ldp     x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-TAIL: ldp     x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #48
 ; CHECK-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack8([8 x i32], i32 %stacked) {
 ; CHECK-LABEL: func_stack8:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK: stp x29, x30, [sp, #32]
+; CHECK: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
 
 
 ; CHECK-TAIL-LABEL: func_stack8:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: stp x29, x30, [sp, #32]
+; CHECK-TAIL: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -91,23 +95,22 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK: ldp     x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-TAIL-NEXT: add     sp, sp, #16
+; CHECK-TAIL: ldp     x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add     sp, sp, #64
 ; CHECK-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32:
-; CHECK: mov x29, sp
+; CHECK: add x29, sp, #32
 
 ; CHECK-TAIL-LABEL: func_stack32:
-; CHECK-TAIL: mov x29, sp
+; CHECK-TAIL: add x29, sp, #32
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -136,13 +139,12 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK: ldp     x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-TAIL-NEXT: add     sp, sp, #32
+; CHECK-TAIL: ldp     x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add     sp, sp, #80
 ; CHECK-TAIL-NEXT: ret
 }
 
@@ -180,22 +182,21 @@
 ; Check that arg stack pop is done after callee-save restore when no frame pointer is used.
 define fastcc void @func_stack32_leaf_local([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32_leaf_local:
-; CHECK: str     x20, [sp, #-16]!
-; CHECK-NEXT: sub     sp, sp, #16
+; CHECK: sub     sp, sp, #32
+; CHECK-NEXT: str     x20, [sp, #16]
 ; CHECK: nop
 ; CHECK-NEXT: //NO_APP
-; CHECK-NEXT: add     sp, sp, #16
-; CHECK-NEXT: ldr     x20, [sp], #16
+; CHECK-NEXT: ldr     x20, [sp, #16]
+; CHECK-NEXT: add     sp, sp, #32
 ; CHECK-NEXT: ret
 
 ; CHECK-TAIL-LABEL: func_stack32_leaf_local:
-; CHECK-TAIL: str     x20, [sp, #-16]!
-; CHECK-TAIL-NEXT: sub     sp, sp, #16
+; CHECK-TAIL: sub     sp, sp, #32
+; CHECK-TAIL-NEXT: str     x20, [sp, #16]
 ; CHECK-TAIL: nop
 ; CHECK-TAIL-NEXT: //NO_APP
-; CHECK-TAIL-NEXT: add     sp, sp, #16
-; CHECK-TAIL-NEXT: ldr     x20, [sp], #16
-; CHECK-TAIL-NEXT: add     sp, sp, #32
+; CHECK-TAIL-NEXT: ldr     x20, [sp, #16]
+; CHECK-TAIL-NEXT: add     sp, sp, #64
 ; CHECK-TAIL-NEXT: ret
 
 ; CHECK-TAIL-RZ-LABEL: func_stack32_leaf_local:
diff --git a/llvm/test/CodeGen/AArch64/func-calls.ll b/llvm/test/CodeGen/AArch64/func-calls.ll
index 8969dbc..3541736 100644
--- a/llvm/test/CodeGen/AArch64/func-calls.ll
+++ b/llvm/test/CodeGen/AArch64/func-calls.ll
@@ -89,11 +89,11 @@
   ; that varstruct is passed on the stack. Rather dependent on how a
   ; memcpy gets created, but the following works for now.
 
-; CHECK-DAG: str {{q[0-9]+}}, [sp, #-16]
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b
 
-; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp, #-16]!
+; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]]
 
diff --git a/llvm/test/CodeGen/AArch64/tailcall-implicit-sret.ll b/llvm/test/CodeGen/AArch64/tailcall-implicit-sret.ll
index 5d68059..3955877 100644
--- a/llvm/test/CodeGen/AArch64/tailcall-implicit-sret.ll
+++ b/llvm/test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -disable-post-ra -asm-verbose=false | FileCheck %s
 ; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"