[X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128 (reapplied)

As reported on PR26235, we don't currently make use of the VBROADCASTF128/VBROADCASTI128 instructions (or the AVX512 equivalents) to load+splat a 128-bit vector to both lanes of a 256-bit vector.

This patch enables lowering from subvector insertion/concatenation patterns and auto-upgrades the llvm.x86.avx.vbroadcastf128.pd.256 / llvm.x86.avx.vbroadcastf128.ps.256 intrinsics to match.

We could possibly investigate using VBROADCASTF128/VBROADCASTI128 to load repeated constants as well (similar to how we already do for scalar broadcasts).

Reapplied with fix for PR28657 - removed intrinsic definitions (clang companion patch to be submitted shortly).

Differential Revision: https://reviews.llvm.org/D22460

llvm-svn: 276416
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 2e4a2f8..a8145b6 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -296,6 +296,7 @@
          Name.startswith("avx.blend.p") ||
          Name == "avx2.pblendw" ||
          Name.startswith("avx2.pblendd.") ||
+         Name.startswith("avx.vbroadcastf128") ||
          Name == "avx2.vbroadcasti128" ||
          Name == "xop.vpcmov" ||
          (Name.startswith("xop.vpcom") && F->arg_size() == 2))) {
@@ -886,7 +887,7 @@
       Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
       Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)});
       Rep = Builder.CreateZExt(Rep, CI->getType(), "");
-    } else if (IsX86 && Name.startswith("avx.vbroadcast")) {
+    } else if (IsX86 && Name.startswith("avx.vbroadcast.s")) {
       // Replace broadcasts with a series of insertelements.
       Type *VecTy = CI->getType();
       Type *EltTy = VecTy->getVectorElementType();
@@ -918,15 +919,21 @@
       bool DoSext = (StringRef::npos != Name.find("pmovsx"));
       Rep = DoSext ? Builder.CreateSExt(SV, DstTy)
                    : Builder.CreateZExt(SV, DstTy);
-    } else if (IsX86 && Name == "avx2.vbroadcasti128") {
-      // Replace vbroadcasts with a vector shuffle.
-      Type *VT = VectorType::get(Type::getInt64Ty(C), 2);
+    } else if (IsX86 && (Name.startswith("avx.vbroadcastf128") ||
+                         Name == "avx2.vbroadcasti128")) {
+      // Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle.
+      Type *EltTy = CI->getType()->getVectorElementType();
+      unsigned NumSrcElts = 128 / EltTy->getPrimitiveSizeInBits();
+      Type *VT = VectorType::get(EltTy, NumSrcElts);
       Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0),
                                             PointerType::getUnqual(VT));
       Value *Load = Builder.CreateLoad(VT, Op);
-      uint32_t Idxs[4] = { 0, 1, 0, 1 };
-      Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
-                                        Idxs);
+      if (NumSrcElts == 2)
+        Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
+                                          { 0, 1, 0, 1 });
+      else
+        Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
+                                          { 0, 1, 2, 3, 0, 1, 2, 3 });
     } else if (IsX86 && (Name.startswith("avx2.pbroadcast") ||
                          Name.startswith("avx2.vbroadcast") ||
                          Name.startswith("avx512.pbroadcast") ||