- Handle special scalar_to_vector case: splats. Using a native 128-bit
shuffle before inserting on a 256-bit vector.
- Add AVX versions of movd/movq instructions
- Introduce a few COPY patterns to match insert_subvector instructions.
This turns a trivial insert_subvector instruction into a register copy,
coalescing the xmm into a ymm and avoid emiting on more instruction.

llvm-svn: 136002
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0b5ac6a..2f74c0f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3955,6 +3955,34 @@
   return DAG.getNode(ISD::BITCAST, dl, VT, V);
 }
 
+/// PromoteVectorToScalarSplat - Since there's no native support for
+/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector +
+/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the
+/// shuffle before the insertion, this yields less instructions in the end.
+static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV,
+                                          SelectionDAG &DAG) {
+  EVT SrcVT = SV->getValueType(0);
+  SDValue V1 = SV->getOperand(0);
+  DebugLoc dl = SV->getDebugLoc();
+  int NumElems = SrcVT.getVectorNumElements();
+
+  assert(SrcVT.is256BitVector() && "unknown howto handle vector type");
+
+  SmallVector<int, 4> Mask;
+  for (int i = 0; i < NumElems/2; ++i)
+    Mask.push_back(SV->getMaskElt(i));
+
+  EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
+                             NumElems/2);
+  SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1),
+                                     DAG.getUNDEF(SVT), &Mask[0]);
+  SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1,
+                                    DAG.getConstant(0, MVT::i32), DAG, dl);
+
+  return Insert128BitVector(InsV, SV1,
+                       DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+}
+
 /// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
 /// v8i32, v16i16 or v32i8 to v8f32.
 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
@@ -5742,7 +5770,17 @@
     if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
       return Op;
 
-    // Handle splats by matching through known masks
+    // Since there's no native support for scalar_to_vector for 256-bit AVX, a
+    // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
+    // idiom and do the shuffle before the insertion, this yields less
+    // instructions in the end.
+    if (VT.is256BitVector() &&
+        V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+        V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return PromoteVectorToScalarSplat(SVOp, DAG);
+
+    // Handle splats by matching through known shuffle masks
     if ((VT.is128BitVector() && NumElem <= 4) ||
         (VT.is256BitVector() && NumElem <= 8))
       return SDValue();
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 6d89bcc..34d2676 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -501,6 +501,9 @@
 class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern>
       : PDI<o, F, outs, ins, asm, pattern>, REX_W;
+class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern>
+      : VPDI<o, F, outs, ins, asm, pattern>, VEX_W;
 
 // MMX Instruction templates
 //
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index f792f53..4dd2d41 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -467,3 +467,4 @@
                                                    node:$index), [{
   return X86::isVINSERTF128Index(N);
 }], INSERT_get_vinsertf128_imm>;
+
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 24c734a..82e0d2a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -2858,6 +2858,14 @@
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                       VEX;
+def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))]>, VEX;
+def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))]>, VEX;
+
 def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -5358,6 +5366,20 @@
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 
+// Special COPY patterns
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+
 //===----------------------------------------------------------------------===//
 // VEXTRACTF128 - Extract packed floating-point values
 //
diff --git a/llvm/test/CodeGen/X86/avx-256-splat.ll b/llvm/test/CodeGen/X86/avx-256-splat.ll
index 39230fe..edc17b7 100644
--- a/llvm/test/CodeGen/X86/avx-256-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-256-splat.ll
@@ -5,7 +5,6 @@
 ; CHECK: vextractf128 $0
 ; CHECK-NEXT: punpcklbw
 ; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: vinsertf128 $0
 ; CHECK-NEXT: vinsertf128 $1
 ; CHECK-NEXT: vpermilps $85
 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
@@ -16,7 +15,6 @@
 
 ; CHECK: vextractf128 $0
 ; CHECK-NEXT: punpckhwd
-; CHECK-NEXT: vinsertf128 $0
 ; CHECK-NEXT: vinsertf128 $1
 ; CHECK-NEXT: vpermilps $85
 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
@@ -25,3 +23,25 @@
   ret <16 x i16> %shuffle
 }
 
+; CHECK: vmovd
+; CHECK-NEXT: movlhps
+; CHECK-NEXT: vinsertf128 $1
+define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
+entry:
+  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
+  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
+  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
+  ret <4 x i64> %vecinit6.i
+}
+
+; CHECK: vshufpd
+; CHECK-NEXT: vinsertf128 $1
+define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
+entry:
+  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
+  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
+  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
+  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
+  ret <4 x double> %vecinit6.i
+}
diff --git a/llvm/test/CodeGen/X86/avx-256.ll b/llvm/test/CodeGen/X86/avx-256.ll
index 20d31e7..a6d1450 100644
--- a/llvm/test/CodeGen/X86/avx-256.ll
+++ b/llvm/test/CodeGen/X86/avx-256.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 -mattr=avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
 @x = common global <8 x float> zeroinitializer, align 32
 @y = common global <4 x double> zeroinitializer, align 32
@@ -12,4 +12,3 @@
   store <4 x double> zeroinitializer, <4 x double>* @y, align 32
   ret void
 }
-