[x86] enable CGP memcmp() expansion for 2/4/8 byte sizes

There are a couple of potential improvements as seen in the IR and asm:
1. We're unnecessarily extending to a larger type to compare values.
2. The codegen for (select cond, 1, -1) could avoid a cmov (or we could change the order of the compares, so that we have a select with a 0 operand).

llvm-svn: 305802

commit: 0656629b870ae9933e350c5f4edc733012e7ece0 [log] [tgz]
author: Sanjay Patel <spatel@rotateright.com> Tue Jun 20 15:58:30 2017 +0000
committer: Sanjay Patel <spatel@rotateright.com> Tue Jun 20 15:58:30 2017 +0000
tree: 82fc84249670cc54145d258342581ecd40b81bec
parent: 4822b5b649f0086aa8339c2def1dbdd303dcb257 [diff]
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index eacf2e5..8dfaf3f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -1662,6 +1662,12 @@
   MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   MaxStoresPerMemmoveOptSize = 4;
+
+  // TODO: These control memcmp expansion in CGP and are set low to prevent
+  // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder.
+  MaxLoadsPerMemcmp = 1;
+  MaxLoadsPerMemcmpOptSize = 1;
+
   // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
   setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
 

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 1d58ccc..f13933e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

@@ -2232,6 +2232,12 @@
   return (CallerBits & CalleeBits) == CalleeBits;
 }
 
+bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+  // TODO: We can increase these based on available vector ops.
+  MaxLoadSize = ST->is64Bit() ? 8 : 4;
+  return true;
+}
+
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
   // TODO: We expect this to be beneficial regardless of arch,
   // but there are currently some unexplained performance artifacts on Atom.

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index b907b75..375fb92 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h

@@ -107,7 +107,7 @@
   bool isLegalMaskedScatter(Type *DataType);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
-
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
   bool enableInterleavedAccessVectorization();
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
commit	0656629b870ae9933e350c5f4edc733012e7ece0	[log] [tgz]
author	Sanjay Patel <spatel@rotateright.com>	Tue Jun 20 15:58:30 2017 +0000
committer	Sanjay Patel <spatel@rotateright.com>	Tue Jun 20 15:58:30 2017 +0000
tree	82fc84249670cc54145d258342581ecd40b81bec
parent	4822b5b649f0086aa8339c2def1dbdd303dcb257 [diff]