Generate better code for v8i16 shuffles on SSE2
Generate better code for v16i8 shuffles on SSE2 (avoids stack)
Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it is fewer uops.
Document the shuffle matching logic and add some FIXMEs for later further
  cleanups.
New tests that test the above.

Examples:

New:
_shuf2:
	pextrw	$7, %xmm0, %eax
	punpcklqdq	%xmm1, %xmm0
	pshuflw	$128, %xmm0, %xmm0
	pinsrw	$2, %eax, %xmm0

Old:
_shuf2:
	pextrw	$2, %xmm0, %eax
	pextrw	$7, %xmm0, %ecx
	pinsrw	$2, %ecx, %xmm0
	pinsrw	$3, %eax, %xmm0
	movd	%xmm1, %eax
	pinsrw	$4, %eax, %xmm0
	ret

=========

New:
_shuf4:
	punpcklqdq	%xmm1, %xmm0
	pshufb	LCPI1_0, %xmm0

Old:
_shuf4:
	pextrw	$3, %xmm0, %eax
	movsd	%xmm1, %xmm0
	pextrw	$3, %xmm1, %ecx
	pinsrw	$4, %ecx, %xmm0
	pinsrw	$5, %eax, %xmm0

========

New:
_shuf1:
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	pextrw	$1, %xmm0, %eax
	rolw	$8, %ax
	movd	%xmm0, %ecx
	rolw	$8, %cx
	pextrw	$5, %xmm0, %edx
	pextrw	$4, %xmm0, %esi
	pextrw	$3, %xmm0, %edi
	pextrw	$2, %xmm0, %ebx
	movaps	%xmm0, %xmm1
	pinsrw	$0, %ecx, %xmm1
	pinsrw	$1, %eax, %xmm1
	rolw	$8, %bx
	pinsrw	$2, %ebx, %xmm1
	rolw	$8, %di
	pinsrw	$3, %edi, %xmm1
	rolw	$8, %si
	pinsrw	$4, %esi, %xmm1
	rolw	$8, %dx
	pinsrw	$5, %edx, %xmm1
	pextrw	$7, %xmm0, %eax
	rolw	$8, %ax
	movaps	%xmm1, %xmm0
	pinsrw	$7, %eax, %xmm0
	popl	%esi
	popl	%edi
	popl	%ebx
	ret

Old:
_shuf1:
	subl	$252, %esp
	movaps	%xmm0, (%esp)
	movaps	%xmm0, 16(%esp)
	movaps	%xmm0, 32(%esp)
	movaps	%xmm0, 48(%esp)
	movaps	%xmm0, 64(%esp)
	movaps	%xmm0, 80(%esp)
	movaps	%xmm0, 96(%esp)
	movaps	%xmm0, 224(%esp)
	movaps	%xmm0, 208(%esp)
	movaps	%xmm0, 192(%esp)
	movaps	%xmm0, 176(%esp)
	movaps	%xmm0, 160(%esp)
	movaps	%xmm0, 144(%esp)
	movaps	%xmm0, 128(%esp)
	movaps	%xmm0, 112(%esp)
	movzbl	14(%esp), %eax
	movd	%eax, %xmm1
	movzbl	22(%esp), %eax
	movd	%eax, %xmm2
	punpcklbw	%xmm1, %xmm2
	movzbl	42(%esp), %eax
	movd	%eax, %xmm1
	movzbl	50(%esp), %eax
	movd	%eax, %xmm3
	punpcklbw	%xmm1, %xmm3
	punpcklbw	%xmm2, %xmm3
	movzbl	77(%esp), %eax
	movd	%eax, %xmm1
	movzbl	84(%esp), %eax
	movd	%eax, %xmm2
	punpcklbw	%xmm1, %xmm2
	movzbl	104(%esp), %eax
	movd	%eax, %xmm1
	punpcklbw	%xmm1, %xmm0
	punpcklbw	%xmm2, %xmm0
	movaps	%xmm0, %xmm1
	punpcklbw	%xmm3, %xmm1
	movzbl	127(%esp), %eax
	movd	%eax, %xmm0
	movzbl	135(%esp), %eax
	movd	%eax, %xmm2
	punpcklbw	%xmm0, %xmm2
	movzbl	155(%esp), %eax
	movd	%eax, %xmm0
	movzbl	163(%esp), %eax
	movd	%eax, %xmm3
	punpcklbw	%xmm0, %xmm3
	punpcklbw	%xmm2, %xmm3
	movzbl	188(%esp), %eax
	movd	%eax, %xmm0
	movzbl	197(%esp), %eax
	movd	%eax, %xmm2
	punpcklbw	%xmm0, %xmm2
	movzbl	217(%esp), %eax
	movd	%eax, %xmm4
	movzbl	225(%esp), %eax
	movd	%eax, %xmm0
	punpcklbw	%xmm4, %xmm0
	punpcklbw	%xmm2, %xmm0
	punpcklbw	%xmm3, %xmm0
	punpcklbw	%xmm1, %xmm0
	addl	$252, %esp
	ret



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@65311 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/vec_shuffle-12.ll b/test/CodeGen/X86/vec_shuffle-12.ll
index aad27ea..98b455a 100644
--- a/test/CodeGen/X86/vec_shuffle-12.ll
+++ b/test/CodeGen/X86/vec_shuffle-12.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 > %t
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah > %t
 ; RUN: not grep punpck %t
 ; RUN: grep pextrw %t | count 4
 ; RUN: grep pinsrw %t | count 6
-; RUN: grep pshuflw %t | count 3
+; RUN: grep pshuflw %t | count 1
 ; RUN: grep pshufhw %t | count 2
 
 define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
diff --git a/test/CodeGen/X86/vec_shuffle-13.ll b/test/CodeGen/X86/vec_shuffle-13.ll
index 4511e95..61cd128 100644
--- a/test/CodeGen/X86/vec_shuffle-13.ll
+++ b/test/CodeGen/X86/vec_shuffle-13.ll
@@ -1,7 +1,7 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 > %t
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah > %t
 ; RUN: grep movlhps %t | count 1
-; RUN: grep movss %t | count 1
 ; RUN: grep pshufd %t | count 1
+; RUN: grep movss %t | count 1
 ; RUN: grep pshuflw %t | count 1
 ; RUN: grep pshufhw %t | count 1
 
diff --git a/test/CodeGen/X86/vec_shuffle-2.ll b/test/CodeGen/X86/vec_shuffle-2.ll
index ae69801..b049565 100644
--- a/test/CodeGen/X86/vec_shuffle-2.ll
+++ b/test/CodeGen/X86/vec_shuffle-2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
 ; RUN: grep pshufhw %t | count 1
 ; RUN: grep pshuflw %t | count 1
 ; RUN: grep movhps  %t | count 1
diff --git a/test/CodeGen/X86/vec_shuffle-21.ll b/test/CodeGen/X86/vec_shuffle-21.ll
index 5409acf..eec2691 100644
--- a/test/CodeGen/X86/vec_shuffle-21.ll
+++ b/test/CodeGen/X86/vec_shuffle-21.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
 ; RUN: grep pshuflw %t | count 1
 ; RUN: grep pextrw %t | count 2
 ; RUN: grep pinsrw %t | count 2
diff --git a/test/CodeGen/X86/vec_shuffle-28.ll b/test/CodeGen/X86/vec_shuffle-28.ll
index 0c81e77..f7e5001 100644
--- a/test/CodeGen/X86/vec_shuffle-28.ll
+++ b/test/CodeGen/X86/vec_shuffle-28.ll
@@ -1,8 +1,12 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f
-; RUN: grep punpcklwd %t | count 1
-; RUN: grep pextrw %t | count 6
-; RUN: grep pinsrw %t | count 8
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
+; RUN: grep movd %t | count 1
+; RUN: grep pshuflw %t | count 1
+; RUN: grep pinsrw %t | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f
+; RUN: grep pshufb %t | count 1
 
+; FIXME: this test has a superfluous punpcklqdq pre-pshufb currently.
+;        Don't XFAIL it because it's still better than the previous code.
 
 ; Pack various elements via shuffles.
 define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
@@ -10,24 +14,3 @@
 	%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
 	ret <8 x i16> %tmp7
 }
-
-
-define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
-	ret <8 x i16> %tmp8
-}
-
-
-define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
-	ret <8 x i16> %tmp9
-}
-
-
-define <8 x i16> @shuf4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
-	ret <8 x i16> %tmp9
-}
diff --git a/test/CodeGen/X86/vec_shuffle-29.ll b/test/CodeGen/X86/vec_shuffle-29.ll
index aac63c3..49cb0a7 100644
--- a/test/CodeGen/X86/vec_shuffle-29.ll
+++ b/test/CodeGen/X86/vec_shuffle-29.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -disable-mmx -o %t -f
+; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41,-ssse3 -disable-mmx -o %t -f
 ; RUN: not grep pextrw %t
 ; RUN: grep pinsrw %t
 
diff --git a/test/CodeGen/X86/vec_shuffle-31.ll b/test/CodeGen/X86/vec_shuffle-31.ll
new file mode 100644
index 0000000..0a9dc1f
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-31.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
+; RUN: grep pextrw %t | count 1
+; RUN: grep punpcklqdq %t | count 1
+; RUN: grep pshufhw %t | count 1
+; RUN: grep pinsrw %t | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f
+; RUN: grep pshufb %t | count 1
+
+define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+entry:
+	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
+	ret <8 x i16> %tmp9
+}
diff --git a/test/CodeGen/X86/vec_shuffle-32.ll b/test/CodeGen/X86/vec_shuffle-32.ll
new file mode 100644
index 0000000..3a81948
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-32.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
+; RUN: grep punpcklqdq %t | count 1
+; RUN: grep pextrw %t | count 1
+; RUN: grep pshufd %t | count 1
+; RUN: grep pinsrw %t | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f
+; RUN: grep pshufb %t | count 1
+
+define <8 x i16> @shuf4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+entry:
+	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
+	ret <8 x i16> %tmp9
+}
diff --git a/test/CodeGen/X86/vec_shuffle-33.ll b/test/CodeGen/X86/vec_shuffle-33.ll
new file mode 100644
index 0000000..e3d6304
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-33.ll
@@ -0,0 +1,11 @@
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
+; RUN: grep punpcklqdq %t | count 1
+; RUN: grep pshufhw %t | count 1
+; RUN: not grep pextrw %t
+; RUN: not grep pinsrw %t
+
+define <8 x i16> @shuf5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+entry:
+	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >
+	ret <8 x i16> %tmp9
+}
diff --git a/test/CodeGen/X86/vec_shuffle-34.ll b/test/CodeGen/X86/vec_shuffle-34.ll
new file mode 100644
index 0000000..99c95d1
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-34.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
+; RUN: grep pextrw %t | count 1
+; RUN: grep punpcklqdq %t | count 1
+; RUN: grep pshuflw %t | count 1
+; RUN: grep pinsrw %t | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f
+; RUN: grep pshufb %t | count 2
+
+define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+entry:
+	%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
+	ret <8 x i16> %tmp8
+}
diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll
new file mode 100644
index 0000000..f2b241f
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-35.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
+; RUN: grep pextrw %t | count 13
+; RUN: grep pinsrw %t | count 14
+; RUN: grep rolw %t | count 13
+; RUN: not grep esp %t
+; RUN: not grep ebp %t
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f
+; RUN: grep pshufb %t | count 3
+
+define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone {
+entry:
+	%tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
+	ret <16 x i8> %tmp8
+}
+
+define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
+entry:
+	%tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> < i32 undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
+	ret <16 x i8> %tmp8
+}