[CostModel][X86] Improve single src shuffle costs

Add missing SK_PermuteSingleSrc costs for AVX2 targets and earlier, also added some of the simpler SK_PermuteTwoSrc costs to support splitting of SK_PermuteSingleSrc shuffles

llvm-svn: 310632
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 8242518..fcacef8 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -838,6 +838,8 @@
     { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
     { TTI::SK_Alternate, MVT::v32i8,  1 }, // vpblendvb
 
+    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
+    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
     { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
     { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
     { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
@@ -872,7 +874,16 @@
     { TTI::SK_Alternate, MVT::v8i32,  1 }, // vblendps
     { TTI::SK_Alternate, MVT::v8f32,  1 }, // vblendps
     { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
-    { TTI::SK_Alternate, MVT::v32i8,  3 }  // vpand + vpandn + vpor
+    { TTI::SK_Alternate, MVT::v32i8,  3 }, // vpand + vpandn + vpor
+
+    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  3 }, // 2*vperm2f128 + vshufpd
+    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  3 }, // 2*vperm2f128 + vshufpd
+    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  4 }, // 2*vperm2f128 + 2*vshufps
+    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  4 }, // 2*vperm2f128 + 2*vshufps
+    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
+                                                  // + 2*por + vinsertf128
+    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  8 }, // vextractf128 + 4*pshufb
+                                                  // + 2*por + vinsertf128
   };
 
   if (ST->hasAVX())
@@ -899,11 +910,14 @@
     { TTI::SK_Reverse,   MVT::v8i16,  1 }, // pshufb
     { TTI::SK_Reverse,   MVT::v16i8,  1 }, // pshufb
 
-    { TTI::SK_Alternate, MVT::v8i16,  3 }, // pshufb + pshufb + por
-    { TTI::SK_Alternate, MVT::v16i8,  3 }, // pshufb + pshufb + por
+    { TTI::SK_Alternate, MVT::v8i16,  3 }, // 2*pshufb + por
+    { TTI::SK_Alternate, MVT::v16i8,  3 }, // 2*pshufb + por
 
     { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
-    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }  // pshufb
+    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
+
+    { TTI::SK_PermuteTwoSrc,    MVT::v8i16, 3 }, // 2*pshufb + por
+    { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 3 }, // 2*pshufb + por
   };
 
   if (ST->hasSSSE3())
@@ -914,13 +928,13 @@
     { TTI::SK_Broadcast, MVT::v2f64,  1 }, // shufpd
     { TTI::SK_Broadcast, MVT::v2i64,  1 }, // pshufd
     { TTI::SK_Broadcast, MVT::v4i32,  1 }, // pshufd
-    { TTI::SK_Broadcast, MVT::v8i16,  2 }, // pshuflw  + pshufd
+    { TTI::SK_Broadcast, MVT::v8i16,  2 }, // pshuflw + pshufd
     { TTI::SK_Broadcast, MVT::v16i8,  3 }, // unpck + pshuflw + pshufd
 
     { TTI::SK_Reverse,   MVT::v2f64,  1 }, // shufpd
     { TTI::SK_Reverse,   MVT::v2i64,  1 }, // pshufd
     { TTI::SK_Reverse,   MVT::v4i32,  1 }, // pshufd
-    { TTI::SK_Reverse,   MVT::v8i16,  3 }, // pshuflw + pshufhw  + pshufd
+    { TTI::SK_Reverse,   MVT::v8i16,  3 }, // pshuflw + pshufhw + pshufd
     { TTI::SK_Reverse,   MVT::v16i8,  9 }, // 2*pshuflw + 2*pshufhw
                                            // + 2*pshufd + 2*unpck + packus
 
@@ -930,8 +944,17 @@
     { TTI::SK_Alternate, MVT::v8i16,  3 }, // pand + pandn + por
     { TTI::SK_Alternate, MVT::v16i8,  3 }, // pand + pandn + por
 
-    { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
-    { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }  // pshufd
+    { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 }, // shufpd
+    { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 }, // pshufd
+    { TTI::SK_PermuteSingleSrc, MVT::v4i32,  1 }, // pshufd
+    { TTI::SK_PermuteSingleSrc, MVT::v8i16,  5 }, // 2*pshuflw + 2*pshufhw
+                                                  // + pshufd/unpck
+    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
+                                                  // + 2*pshufd + 2*unpck + 2*packus
+
+    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // shufpd
+    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // shufpd
+    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  2 }, // 2*{unpck,movsd,pshufd}
   };
 
   if (ST->hasSSE2())
@@ -939,9 +962,11 @@
       return LT.first * Entry->Cost;
 
   static const CostTblEntry SSE1ShuffleTbl[] = {
-    { TTI::SK_Broadcast, MVT::v4f32,  1 }, // shufps
-    { TTI::SK_Reverse,   MVT::v4f32,  1 }, // shufps
-    { TTI::SK_Alternate, MVT::v4f32,  2 }  // 2*shufps
+    { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
+    { TTI::SK_Reverse,          MVT::v4f32, 1 }, // shufps
+    { TTI::SK_Alternate,        MVT::v4f32, 2 }, // 2*shufps
+    { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
+    { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
   };
 
   if (ST->hasSSE1())