AVX-512 Loop Vectorizer: Cost calculation for interleave load/store patterns.

The X86 target does not provide any target-specific cost calculation for interleave patterns. It uses the common target-independent calculation, which gives very high numbers. As a result, the scalar version is chosen in many cases. The situation on AVX-512 is even worse, since we have 3-src shuffles that significantly reduce the cost.

In this patch I calculate the cost on AVX-512. It will make it possible to compare the interleave pattern with gather/scatter and choose a better solution (PR31426).

* Shuffle-broadcast cost will be changed in Simon's upcoming patch.

Differential Revision: https://reviews.llvm.org/D28118

llvm-svn: 290810
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index db563c0..5c02130 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -598,9 +598,6 @@
 
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
-  // We only estimate the cost of reverse and alternate shuffles.
-  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 
   if (Kind == TTI::SK_Reverse) {
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -700,9 +697,8 @@
       if (const auto *Entry =
               CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
         return LT.first * Entry->Cost;
-  }
 
-  if (Kind == TTI::SK_Alternate) {
+  } else if (Kind == TTI::SK_Alternate) {
     // 64-bit packed float vectors (v2f32) are widened to type v4f32.
     // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -792,7 +788,132 @@
     if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
                                             ISD::VECTOR_SHUFFLE, LT.second))
       return LT.first * Entry->Cost;
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  } else if (Kind == TTI::SK_PermuteTwoSrc) {
+    // We assume that source and destination have the same vector type.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+    int NumOfDests = LT.first;
+    int NumOfShufflesPerDest = LT.first * 2 - 1;
+    int NumOfShuffles = NumOfDests * NumOfShufflesPerDest;
+
+    static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+        {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b
+        {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}  // vpermt2b
+    };
+
+    if (ST->hasVBMI())
+      if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return NumOfShuffles * Entry->Cost;
+
+    static const CostTblEntry AVX512BWShuffleTbl[] = {
+        {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w
+        {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},  // vpermt2w
+        {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3},  // zext + vpermt2w + trunc
+        {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}   // zext + vpermt2w + trunc
+    };
+
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return NumOfShuffles * Entry->Cost;
+
+    static const CostTblEntry AVX512ShuffleTbl[] = {
+        {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1},  // vpermt2pd
+        {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps
+        {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1},  // vpermt2q
+        {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d
+        {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vpermt2pd
+        {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vpermt2ps
+        {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vpermt2q
+        {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vpermt2d
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // vpermt2pd
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},  // vpermt2ps
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // vpermt2q
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}   // vpermt2d
+    };
+
+    if (ST->hasAVX512())
+      if (const auto *Entry =
+              CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return NumOfShuffles * Entry->Cost;
+
+  } else if (Kind == TTI::SK_PermuteSingleSrc) {
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+    if (LT.first == 1) {
+
+      static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+          {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb
+          {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}  // vpermb
+      };
+
+      if (ST->hasVBMI())
+        if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+                                                ISD::VECTOR_SHUFFLE, LT.second))
+          return Entry->Cost;
+
+      static const CostTblEntry AVX512BWShuffleTbl[] = {
+          {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw
+          {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw
+          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},  // vpermw
+          {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8},  // extend to v32i16
+          {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}   // vpermw + zext/trunc
+      };
+
+      if (ST->hasBWI())
+        if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                                ISD::VECTOR_SHUFFLE, LT.second))
+          return Entry->Cost;
+
+      static const CostTblEntry AVX512ShuffleTbl[] = {
+          {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1},  // vpermpd
+          {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vpermpd
+          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // vpermpd
+          {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps
+          {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vpermps
+          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},  // vpermps
+          {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1},  // vpermq
+          {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vpermq
+          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // vpermq
+          {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd
+          {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vpermd
+          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},  // vpermd
+          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}   // pshufb
+      };
+
+      if (ST->hasAVX512())
+        if (const auto *Entry =
+            CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+          return Entry->Cost;
+
+    } else {
+      // We are going to permute multiple sources and the result will be in
+      // multiple destinations. Providing an accurate cost only for splits where
+      // the element type remains the same.
+
+      MVT LegalVT = LT.second;
+      if (LegalVT.getVectorElementType().getSizeInBits() ==
+              Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
+          LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+
+        unsigned VecTySize = DL.getTypeStoreSize(Tp);
+        unsigned LegalVTSize = LegalVT.getStoreSize();
+        // Number of source vectors after legalization:
+        unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+        // Number of destination vectors after legalization:
+        unsigned NumOfDests = LT.first;
+
+        Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
+                                           LegalVT.getVectorNumElements());
+
+        unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+        return NumOfShuffles *
+               getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+      }
+    }
   }
 
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
@@ -1942,13 +2063,14 @@
   // Vector-4 of gather/scatter instruction does not exist on KNL.
   // We can extend it to 8 elements, but zeroing upper bits of
   // the mask vector will add more instructions. Right now we give the scalar
-  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is
-  // better in the VariableMask case.
+  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
+  // is better in the VariableMask case.
   if (VF == 2 || (VF == 4 && !ST->hasVLX()))
     Scalarize = true;
 
   if (Scalarize)
-    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
+                           AddressSpace);
 
   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
 }
@@ -2013,3 +2135,115 @@
   // As a temporary solution, disable on Atom.
   return !(ST->isAtom() || ST->isSLM());
 }
+
+// Get estimation for interleaved load/store operations and strided load.
+// \p Indices contains indices for strided load.
+// \p Factor - the factor of interleaving.
+// AVX-512 provides 3-src shuffles that significantly reduce the cost.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+                                                 unsigned Factor,
+                                                 ArrayRef<unsigned> Indices,
+                                                 unsigned Alignment,
+                                                 unsigned AddressSpace) {
+
+  // VecTy for interleave memop is <VF*Factor x Elt>.
+  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+  // VecTy = <12 x i32>.
+
+  // Calculate the number of memory operations (NumOfMemOps), required
+  // for load/store the VecTy.
+  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+  unsigned LegalVTSize = LegalVT.getStoreSize();
+  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+  // Get the cost of one memory operation.
+  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+                                        LegalVT.getVectorNumElements());
+  unsigned MemOpCost =
+      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+  if (Opcode == Instruction::Load) {
+    // Kind of shuffle depends on number of loaded values.
+    // If we load the entire data in one register, we can use a 1-src shuffle.
+    // Otherwise, we'll merge 2 sources in each operation.
+    TTI::ShuffleKind ShuffleKind =
+        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
+
+    unsigned ShuffleCost =
+        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+
+    unsigned NumOfLoadsInInterleaveGrp =
+        Indices.size() ? Indices.size() : Factor;
+    Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
+                                     VecTy->getVectorNumElements() / Factor);
+    unsigned NumOfResults =
+        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
+        NumOfLoadsInInterleaveGrp;
+
+    // About a half of the loads may be folded in shuffles when we have only
+    // one result. If we have more than one result, we do not fold loads at all.
+    unsigned NumOfUnfoldedLoads =
+        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
+
+    // Get a number of shuffle operations per result.
+    unsigned NumOfShufflesPerResult =
+        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
+
+    // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+    // When we have more than one destination, we need additional instructions
+    // to keep sources.
+    unsigned NumOfMoves = 0;
+    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
+      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
+
+    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+
+    return Cost;
+  }
+
+  // Store.
+  assert(Opcode == Instruction::Store &&
+         "Expected Store Instruction at this  point");
+
+  // There are no strided stores at the moment. And a store can't be folded
+  // into a shuffle.
+  unsigned NumOfSources = Factor; // The number of values to be merged.
+  unsigned ShuffleCost =
+      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+  unsigned NumOfShufflesPerStore = NumOfSources - 1;
+
+  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+  // We need additional instructions to keep sources.
+  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
+  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+             NumOfMoves;
+  return Cost;
+}
+
+int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
+    RequiresBW = false;
+    Type *EltTy = VecTy->getVectorElementType();
+    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+      return true;
+    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
+      RequiresBW = true;
+      return true;
+    }
+    return false;
+  };
+  bool RequiresBW;
+  bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
+  if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
+    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
+                                            Alignment, AddressSpace);
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}