[X86] Prefer unpckhpd over movhlps in isel for fake unary cases

In r337348, I changed lowering to prefer the X86ISD::UNPCKL/UNPCKH opcodes over MOVLHPS/MOVHLPS for v2f64 {0,0} and {1,1} shuffles when we have SSE2. This enabled the removal of a bunch of weirdly bitcasted isel patterns in r337349. To avoid changing the tests, I placed a gross hack in isel to still emit movhlps instructions for fake unary unpckh nodes. A similar hack was not needed for unpckl and movlhps because we do execution domain switching for those, but unpckh and movhlps have swapped operand order, so the same trick doesn't apply.
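
To make the operand-order difference concrete, here is a small sketch in AT&T syntax (illustrative only, mirroring the asm comments in the tests below): after copying the input, either instruction produces the {1,1} splat, but they pull the low result element from opposite operands.

    movaps   %xmm1, %xmm2        # xmm2 is now a copy of xmm1
    movhlps  %xmm1, %xmm2        # xmm2 = xmm1[1],xmm2[1]   (low element from the source)
    unpckhpd %xmm1, %xmm2        # xmm2 = xmm2[1],xmm1[1]   (low element from the destination)

Because xmm2 starts out equal to xmm1, both forms leave xmm1[1] in both elements; that equivalence only holds in the fake unary case, which is what the removed hack relied on.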

This patch removes the hack.

This is a code size increase since unpckhpd requires a 0x66 prefix and movhlps does not. But if that's a big concern, we should use movhlps for all unpckhpd opcodes and let commuteInstruction turn it into unpckhpd when it's an advantage.
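
For a rough sense of the size delta (encodings written out by hand, not taken from the patch), the register-register forms differ by exactly the mandatory 0x66 prefix byte:

    movhlps  %xmm0, %xmm1        # 0f 12 c8      (3 bytes)
    unpckhpd %xmm0, %xmm1        # 66 0f 15 c8   (4 bytes)

So each converted shuffle costs one extra byte unless it is later commuted back.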

Differential Revision: https://reviews.llvm.org/D49499

llvm-svn: 341973
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
index 411acd8..a4b255c 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
@@ -50,7 +50,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
 ; SSE2-NEXT:    movaps %xmm1, %xmm2
-; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
@@ -62,7 +62,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
 ; SSE41-NEXT:    movaps %xmm1, %xmm2
-; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
@@ -101,7 +101,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[2,3]
 ; SSE2-NEXT:    mulss %xmm3, %xmm0
 ; SSE2-NEXT:    movaps %xmm1, %xmm3
-; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
 ; SSE2-NEXT:    mulss %xmm3, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
@@ -110,7 +110,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm2, %xmm1
-; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
@@ -122,7 +122,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm3, %xmm0
 ; SSE41-NEXT:    movaps %xmm1, %xmm3
-; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
 ; SSE41-NEXT:    mulss %xmm3, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
@@ -130,7 +130,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm2, %xmm1
-; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
@@ -187,7 +187,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[2,3]
 ; SSE2-NEXT:    mulss %xmm5, %xmm0
 ; SSE2-NEXT:    movaps %xmm1, %xmm5
-; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
 ; SSE2-NEXT:    mulss %xmm5, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
@@ -196,7 +196,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm2, %xmm1
-; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
@@ -205,7 +205,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm3, %xmm1
-; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm3, %xmm0
@@ -214,7 +214,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm4, %xmm1
-; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm4, %xmm0
@@ -226,7 +226,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm5, %xmm0
 ; SSE41-NEXT:    movaps %xmm1, %xmm5
-; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
 ; SSE41-NEXT:    mulss %xmm5, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
@@ -234,7 +234,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm2, %xmm1
-; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
@@ -242,7 +242,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm3, %xmm1
-; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm3, %xmm0
@@ -250,7 +250,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm4, %xmm1
-; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm4, %xmm0
@@ -371,7 +371,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
 ; SSE2-NEXT:    mulss %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm2
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
@@ -382,7 +382,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm0, %xmm2
-; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm2
 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
@@ -418,7 +418,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
 ; SSE2-NEXT:    mulss %xmm0, %xmm2
 ; SSE2-NEXT:    movaps %xmm0, %xmm3
-; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
 ; SSE2-NEXT:    mulss %xmm2, %xmm3
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm3, %xmm0
@@ -427,7 +427,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
 ; SSE2-NEXT:    movaps %xmm1, %xmm2
-; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
@@ -438,7 +438,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm0, %xmm2
 ; SSE41-NEXT:    movaps %xmm0, %xmm3
-; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
 ; SSE41-NEXT:    mulss %xmm2, %xmm3
 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm3, %xmm0
@@ -446,7 +446,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
 ; SSE41-NEXT:    movaps %xmm1, %xmm2
-; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
@@ -500,7 +500,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3]
 ; SSE2-NEXT:    mulss %xmm0, %xmm4
 ; SSE2-NEXT:    movaps %xmm0, %xmm5
-; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
 ; SSE2-NEXT:    mulss %xmm4, %xmm5
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm5, %xmm0
@@ -509,7 +509,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
 ; SSE2-NEXT:    mulss %xmm4, %xmm0
 ; SSE2-NEXT:    movaps %xmm1, %xmm4
-; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
 ; SSE2-NEXT:    mulss %xmm4, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
@@ -518,7 +518,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm2, %xmm1
-; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
@@ -527,7 +527,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm3, %xmm1
-; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm3, %xmm0
@@ -538,7 +538,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm0, %xmm4
 ; SSE41-NEXT:    movaps %xmm0, %xmm5
-; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
 ; SSE41-NEXT:    mulss %xmm4, %xmm5
 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm5, %xmm0
@@ -546,7 +546,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm4, %xmm0
 ; SSE41-NEXT:    movaps %xmm1, %xmm4
-; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
 ; SSE41-NEXT:    mulss %xmm4, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
@@ -554,7 +554,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm2, %xmm1
-; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
@@ -562,7 +562,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm3, %xmm1
-; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm3, %xmm0
@@ -679,7 +679,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
 ; SSE2-NEXT:    mulss {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm2
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
@@ -690,7 +690,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    movaps %xmm0, %xmm2
-; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm2
 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
@@ -726,7 +726,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
 ; SSE2-NEXT:    mulss {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    movaps %xmm0, %xmm3
-; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
 ; SSE2-NEXT:    mulss %xmm2, %xmm3
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm3, %xmm0
@@ -735,7 +735,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
 ; SSE2-NEXT:    movaps %xmm1, %xmm2
-; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
@@ -746,7 +746,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    movaps %xmm0, %xmm3
-; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
 ; SSE41-NEXT:    mulss %xmm2, %xmm3
 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm3, %xmm0
@@ -754,7 +754,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
 ; SSE41-NEXT:    movaps %xmm1, %xmm2
-; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
@@ -808,7 +808,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3]
 ; SSE2-NEXT:    mulss {{.*}}(%rip), %xmm4
 ; SSE2-NEXT:    movaps %xmm0, %xmm5
-; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
 ; SSE2-NEXT:    mulss %xmm4, %xmm5
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm5, %xmm0
@@ -817,7 +817,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
 ; SSE2-NEXT:    mulss %xmm4, %xmm0
 ; SSE2-NEXT:    movaps %xmm1, %xmm4
-; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
 ; SSE2-NEXT:    mulss %xmm4, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
@@ -826,7 +826,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm2, %xmm1
-; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm2, %xmm0
@@ -835,7 +835,7 @@
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm3, %xmm1
-; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE2-NEXT:    mulss %xmm3, %xmm0
@@ -846,7 +846,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss {{.*}}(%rip), %xmm4
 ; SSE41-NEXT:    movaps %xmm0, %xmm5
-; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
 ; SSE41-NEXT:    mulss %xmm4, %xmm5
 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm5, %xmm0
@@ -854,7 +854,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm4, %xmm0
 ; SSE41-NEXT:    movaps %xmm1, %xmm4
-; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
 ; SSE41-NEXT:    mulss %xmm4, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
@@ -862,7 +862,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm2, %xmm1
-; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm2, %xmm0
@@ -870,7 +870,7 @@
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm3, %xmm1
-; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE41-NEXT:    mulss %xmm3, %xmm0
@@ -1182,8 +1182,8 @@
 define double @test_v2f64_one(<2 x double> %a0) {
 ; SSE-LABEL: test_v2f64_one:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE-NEXT:    mulsd %xmm0, %xmm1
 ; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
@@ -1206,8 +1206,8 @@
 define double @test_v4f64_one(<4 x double> %a0) {
 ; SSE-LABEL: test_v4f64_one:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE-NEXT:    movapd %xmm0, %xmm2
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 ; SSE-NEXT:    mulsd %xmm0, %xmm2
 ; SSE-NEXT:    mulsd %xmm1, %xmm2
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
@@ -1243,8 +1243,8 @@
 define double @test_v8f64_one(<8 x double> %a0) {
 ; SSE-LABEL: test_v8f64_one:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm4
-; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
+; SSE-NEXT:    movapd %xmm0, %xmm4
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
 ; SSE-NEXT:    mulsd %xmm0, %xmm4
 ; SSE-NEXT:    mulsd %xmm1, %xmm4
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]