On recent Intel microarchitectures, folding loads into some unary SSE instructions can be suboptimal. To be precise, we should avoid folding a load if the instruction only updates part of the destination register and the non-updated part is not needed (e.g. cvtss2sd, sqrtss). Unfolding the load from these instructions breaks the partial register dependency, which can improve performance. For example, emit

    movss    (%rdi), %xmm0
    cvtss2sd %xmm0, %xmm0

instead of

    cvtss2sd (%rdi), %xmm0

An alternative method to break the dependency is to clear the register first, e.g.

    xorps    %xmm0, %xmm0
    cvtss2sd (%rdi), %xmm0

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@91672 91177308-0d34-0410-b5e6-96231b3b80d8

commit: d3f27fbcf0c9b406a22491220246dfb42a2e2bc0 [log] [tgz]
author: Evan Cheng <evan.cheng@apple.com> Fri Dec 18 07:40:29 2009 +0000
committer: Evan Cheng <evan.cheng@apple.com> Fri Dec 18 07:40:29 2009 +0000
tree: f7204e84da8877e7b062f05bcb1878a05108b44e
parent: c047a17bf56cf13b83ebdd70a77f6b9c512acd29 [diff] [blame]
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 75cdbad..4db3fdb 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp

@@ -266,6 +266,7 @@
     unsigned Model  = 0;
     DetectFamilyModel(EAX, Family, Model);
     IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13);
+    BreakSSEDep = IsIntel;
 
     GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
     HasX86_64 = (EDX >> 29) & 0x1;
@@ -286,6 +287,7 @@
   , HasFMA3(false)
   , HasFMA4(false)
   , IsBTMemSlow(false)
+  , BreakSSEDep(false)
   , DarwinVers(0)
   , stackAlignment(8)
   // FIXME: this is a known good value for Yonah. How about others?
commit	d3f27fbcf0c9b406a22491220246dfb42a2e2bc0	[log] [tgz]
author	Evan Cheng <evan.cheng@apple.com>	Fri Dec 18 07:40:29 2009 +0000
committer	Evan Cheng <evan.cheng@apple.com>	Fri Dec 18 07:40:29 2009 +0000
tree	f7204e84da8877e7b062f05bcb1878a05108b44e
parent	c047a17bf56cf13b83ebdd70a77f6b9c512acd29 [diff] [blame]