[ppc] Correctly compute the cost of loading 32/64-bit memory into VSR

VSX has the instructions lxsiwax/lxsdx, which can load a 32/64-bit value into a VSX register cheaply. This patch makes that known to the memory cost model, so that the vectorization of the test case in PR30990 is considered beneficial.

Differential Revision: https://reviews.llvm.org/D26713
llvm-svn: 288560

commit: 835de1f3ab2df5ce56eab9e5d9021a26fe04b854 [log] [tgz]
author: Guozhi Wei <carrot@google.com> Sat Dec 03 00:41:43 2016 +0000
committer: Guozhi Wei <carrot@google.com> Sat Dec 03 00:41:43 2016 +0000
tree: b7942e7efd680008e726203f88d8a5e394b670e2
parent: 33f947057dff866c3768835dd111dac6613c31d7 [diff] [blame]
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b617a6c..f778534 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

@@ -360,11 +360,6 @@
 
   int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 
-  // Aligned loads and stores are easy.
-  unsigned SrcBytes = LT.second.getStoreSize();
-  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
-    return Cost;
-
   bool IsAltivecType = ST->hasAltivec() &&
                        (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                         LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
@@ -373,6 +368,20 @@
   bool IsQPXType = ST->hasQPX() &&
                    (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
 
+  // VSX has 32b/64b load instructions. Legalization can handle loading of
+  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
+  // PPCTargetLowering can't compute the cost appropriately. So here we
+  // explicitly check this case.
+  unsigned MemBytes = Src->getPrimitiveSizeInBits();
+  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
+      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
+    return 1;
+
+  // Aligned loads and stores are easy.
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
+    return Cost;
+
   // If we can use the permutation-based load sequence, then this is also
   // relatively cheap (not counting loop-invariant instructions): one load plus
   // one permute (the last load in a series has extra cost, but we're
commit	835de1f3ab2df5ce56eab9e5d9021a26fe04b854	[log] [tgz]
author	Guozhi Wei <carrot@google.com>	Sat Dec 03 00:41:43 2016 +0000
committer	Guozhi Wei <carrot@google.com>	Sat Dec 03 00:41:43 2016 +0000
tree	b7942e7efd680008e726203f88d8a5e394b670e2
parent	33f947057dff866c3768835dd111dac6613c31d7 [diff] [blame]