[x86] Remove the 2-addr-to-3-addr "optimization" from shufps to pshufd.
This trades a (register-renamer-friendly) movaps for a floating point
/ integer domain cross. That is a very bad trade, even on architectures
where domain crossing is relatively fast. On any chip where there is
even a one-cycle stall, this is a Very Bad Idea. Removing the transform
doesn't even seem likely to introduce a spill, because the only reason
for the copy is to destructively shuffle in place.
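
For illustration only (not from the original commit; the function name is
mine): a single-register float lane permute of this shape is the kind of
shuffle involved. It lowers to a destructive shufps, so when the result has
to land in a different register we pay a movaps copy of float data, which
the renamer handles cheaply; the old transform instead rewrote it as pshufd,
a non-destructive shuffle in the integer domain.

    /* Hypothetical C sketch of the pattern exercised by the tests below:
     * an in-register float permute [0,2,1,3]. With this change it is
     * lowered via shufps (destructive, possibly preceded/followed by a
     * movaps) rather than pshufd, avoiding the float/int domain cross. */
    #include <xmmintrin.h>

    __m128 permute_0213(__m128 v) {
      return _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 1, 2, 0));
    }
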
Thanks to Ben Kramer for fixing a bug in this code that my new shuffle
lowering exposed, and for highlighting that perhaps it should just go away.
=]
llvm-svn: 219090
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 87b0520..8ad9e5a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -396,14 +396,16 @@
; SSE2: # BB#0:
; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3: # BB#0:
; SSSE3-NEXT: andps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
@@ -435,14 +437,16 @@
; SSE2: # BB#0:
; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3: # BB#0:
; SSSE3-NEXT: orps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
@@ -1124,7 +1128,8 @@
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test1:
@@ -1252,7 +1257,8 @@
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test6:
@@ -1651,7 +1657,8 @@
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test2b:
@@ -1660,7 +1667,8 @@
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test2b:
@@ -1668,7 +1676,8 @@
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test2b: