implement rdar://6653118 - fastisel should fold loads where possible.
Since mem2reg isn't run at -O0, we get a ton of reloads from the stack,
for example, before, this code:
int foo(int x, int y, int z) {
return x+y+z;
}
used to compile into:
_foo: ## @foo
subq $12, %rsp
movl %edi, 8(%rsp)
movl %esi, 4(%rsp)
movl %edx, (%rsp)
movl 8(%rsp), %edx
movl 4(%rsp), %esi
addl %edx, %esi
movl (%rsp), %edx
addl %esi, %edx
movl %edx, %eax
addq $12, %rsp
ret
Now we produce:
_foo: ## @foo
subq $12, %rsp
movl %edi, 8(%rsp)
movl %esi, 4(%rsp)
movl %edx, (%rsp)
movl 8(%rsp), %edx
addl 4(%rsp), %edx ## Folded load
addl (%rsp), %edx ## Folded load
movl %edx, %eax
addq $12, %rsp
ret
Fewer instructions and less register use = faster compiles.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@113102 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll
index 35ec1e7..8db1936 100644
--- a/test/CodeGen/X86/fast-isel-mem.ll
+++ b/test/CodeGen/X86/fast-isel-mem.ll
@@ -1,10 +1,8 @@
-; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin | \
-; RUN: grep lazy_ptr, | count 2
-; RUN: llc < %s -fast-isel -march=x86 -relocation-model=static | \
-; RUN: grep lea
+; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin | FileCheck %s
@src = external global i32
+; rdar://6653118
define i32 @loadgv() nounwind {
entry:
%0 = load i32* @src, align 4
@@ -12,6 +10,14 @@
%2 = add i32 %0, %1
store i32 %2, i32* @src
ret i32 %2
+; This should fold one of the loads into the add.
+; CHECK: loadgv:
+; CHECK: movl L_src$non_lazy_ptr, %ecx
+; CHECK: movl (%ecx), %eax
+; CHECK: addl (%ecx), %eax
+; CHECK: movl %eax, (%ecx)
+; CHECK: ret
+
}
%stuff = type { i32 (...)** }
@@ -21,4 +27,8 @@
entry:
store i32 (...)** getelementptr ([4 x i32 (...)*]* @LotsStuff, i32 0, i32 2), i32 (...)*** null, align 4
ret void
+; CHECK: _t:
+; CHECK: movl $0, %eax
+; CHECK: movl L_LotsStuff$non_lazy_ptr, %ecx
+
}