Use movups to lower memcpy and memset even on processors where unaligned SSE access is not fast (i.e. not only on chips like corei7, where it is).

The theory is that it's still faster than a pair of movq / a quad of movl. This will probably hurt older chips like the P4, but should run faster on current and future Intel processors.

rdar://8817010

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@122955 91177308-0d34-0410-b5e6-96231b3b80d8

commit: 461f1fc359dff438dad25e809499845b10a3d032 [log] [tgz]
author: Evan Cheng <evan.cheng@apple.com> Thu Jan 06 07:58:36 2011 +0000
committer: Evan Cheng <evan.cheng@apple.com> Thu Jan 06 07:58:36 2011 +0000
tree: 143a2a682ffdd84409d6bd1673e22630d42d565e
parent: cce240d26bbf1c2bec9cfff4838d8d807b215586 [diff] [blame]
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
index 1b596b5..8f69b11 100644
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll

@@ -1,8 +1,12 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2   | grep movsd  | count 8
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | FileCheck %s
 
 define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %z) nounwind  {
 entry:
+; CHECK: ccosl:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK: movups
+; CHECK: movups
 	%iz = alloca { x86_fp80, x86_fp80 }		; <{ x86_fp80, x86_fp80 }*> [#uses=3]
 	%tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1		; <x86_fp80*> [#uses=1]
 	%tmp2 = load x86_fp80* %tmp1, align 16		; <x86_fp80> [#uses=1]
commit	461f1fc359dff438dad25e809499845b10a3d032	[log] [tgz]
author	Evan Cheng <evan.cheng@apple.com>	Thu Jan 06 07:58:36 2011 +0000
committer	Evan Cheng <evan.cheng@apple.com>	Thu Jan 06 07:58:36 2011 +0000
tree	143a2a682ffdd84409d6bd1673e22630d42d565e
parent	cce240d26bbf1c2bec9cfff4838d8d807b215586 [diff] [blame]