Implement vector shift up / down and insert zero with ps{rl}lq / ps{rl}ldq.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@51667 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/mmx-insert-element.ll b/test/CodeGen/X86/mmx-insert-element.ll
index dc48836..0aa476d 100644
--- a/test/CodeGen/X86/mmx-insert-element.ll
+++ b/test/CodeGen/X86/mmx-insert-element.ll
@@ -1,23 +1,7 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | grep movq | count 3
-
-; FIXME: This code outputs:
-;
-;   subl $28, %esp
-;   movl 32(%esp), %eax
-;   movd %eax, %mm0
-;   movq %mm0, (%esp)
-;   movl (%esp), %eax
-;   movl %eax, 20(%esp)
-;   movq %mm0, 8(%esp)
-;   movl 12(%esp), %eax
-;   movl %eax, 16(%esp)
-;   movq 16(%esp), %mm0
-;   addl $28, %esp
-;
-; Which is ugly. We need to fix this.
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | not grep movq
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | grep psllq
 
 define <2 x i32> @qux(i32 %A) nounwind {
-entry:
 	%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1		; <<2 x i32>> [#uses=1]
 	ret <2 x i32> %tmp3
 }
diff --git a/test/CodeGen/X86/vec_clear.ll b/test/CodeGen/X86/vec_clear.ll
index d464129..c119a94 100644
--- a/test/CodeGen/X86/vec_clear.ll
+++ b/test/CodeGen/X86/vec_clear.ll
@@ -1,6 +1,7 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | not grep and
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | grep psrldq
 
-define <4 x float> @test(<4 x float>* %v1) {
+define <4 x float> @test(<4 x float>* %v1) nounwind {
         %tmp = load <4 x float>* %v1            ; <<4 x float>> [#uses=1]
         %tmp15 = bitcast <4 x float> %tmp to <2 x i64>          ; <<2 x i64>> [#uses=1]
         %tmp24 = and <2 x i64> %tmp15, bitcast (<4 x i32> < i32 0, i32 0, i32 -1, i32 -1 > to <2 x i64>)              ; <<2 x i64>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll
index 1d374b4..e42a368 100644
--- a/test/CodeGen/X86/vec_insert-3.ll
+++ b/test/CodeGen/X86/vec_insert-3.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep punpcklqdq | count 1
 
-define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) {
+define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
         %tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
         ret <2 x i64> %tmp1
 }
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
new file mode 100644
index 0000000..eaa523e
--- /dev/null
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -0,0 +1,31 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psllq  | grep 32
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pslldq | grep 12
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psrldq | grep 8
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psrldq | grep 12
+
+define void  @t1(i32 %a, <1 x i64>* %P) nounwind {
+       %tmp12 = shl i32 %a, 12
+       %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
+       %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
+       %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64>
+       store <1 x i64> %tmp23, <1 x i64>* %P
+       ret void
+}
+
+define <4 x float> @t2(<4 x float>* %P) nounwind {
+        %tmp1 = load <4 x float>* %P
+        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
+        ret <4 x float> %tmp2
+}
+
+define <4 x float> @t3(<4 x float>* %P) nounwind {
+        %tmp1 = load <4 x float>* %P
+        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
+        ret <4 x float> %tmp2
+}
+
+define <4 x float> @t4(<4 x float>* %P) nounwind {
+        %tmp1 = load <4 x float>* %P
+        %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
+        ret <4 x float> %tmp2
+}
diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll
new file mode 100644
index 0000000..405152e
--- /dev/null
+++ b/test/CodeGen/X86/vec_insert-6.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep pslldq
+
+define <4 x float> @t3(<4 x float>* %P) nounwind  {
+	%tmp1 = load <4 x float>* %P
+	%tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
+	ret <4 x float> %tmp2
+}