- Handle the special scalar_to_vector case of splats by using a native 128-bit
shuffle before inserting into a 256-bit vector.
- Add AVX versions of movd/movq instructions
- Introduce a few COPY patterns to match insert_subvector instructions.
This turns a trivial insert_subvector instruction into a register copy,
coalescing the xmm into a ymm and avoiding emitting one more instruction.

llvm-svn: 136002
diff --git a/llvm/test/CodeGen/X86/avx-256-splat.ll b/llvm/test/CodeGen/X86/avx-256-splat.ll
index 39230fe..edc17b7 100644
--- a/llvm/test/CodeGen/X86/avx-256-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-256-splat.ll
@@ -5,7 +5,6 @@
 ; CHECK: vextractf128 $0
 ; CHECK-NEXT: punpcklbw
 ; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: vinsertf128 $0
 ; CHECK-NEXT: vinsertf128 $1
 ; CHECK-NEXT: vpermilps $85
 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
@@ -16,7 +15,6 @@
 
 ; CHECK: vextractf128 $0
 ; CHECK-NEXT: punpckhwd
-; CHECK-NEXT: vinsertf128 $0
 ; CHECK-NEXT: vinsertf128 $1
 ; CHECK-NEXT: vpermilps $85
 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
@@ -25,3 +23,25 @@
   ret <16 x i16> %shuffle
 }
 
+; CHECK: vmovd
+; CHECK-NEXT: movlhps
+; CHECK-NEXT: vinsertf128 $1
+define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
+entry:
+  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
+  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
+  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
+  ret <4 x i64> %vecinit6.i
+}
+
+; CHECK: vshufpd
+; CHECK-NEXT: vinsertf128 $1
+define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
+entry:
+  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
+  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
+  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
+  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
+  ret <4 x double> %vecinit6.i
+}
diff --git a/llvm/test/CodeGen/X86/avx-256.ll b/llvm/test/CodeGen/X86/avx-256.ll
index 20d31e7..a6d1450 100644
--- a/llvm/test/CodeGen/X86/avx-256.ll
+++ b/llvm/test/CodeGen/X86/avx-256.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 -mattr=avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
 @x = common global <8 x float> zeroinitializer, align 32
 @y = common global <4 x double> zeroinitializer, align 32
@@ -12,4 +12,3 @@
   store <4 x double> zeroinitializer, <4 x double>* @y, align 32
   ret void
 }
-