[DAGCombine][X86][AArch64] Masked merge unfolding: vector edition.

Summary:
This **appears** to be the last missing piece for the masked merge pattern handling in the backend.

This is [[ https://bugs.llvm.org/show_bug.cgi?id=37104 | PR37104 ]].

[[ https://bugs.llvm.org/show_bug.cgi?id=6773 | PR6773 ]] will introduce an IR canonicalization that is likely bad for the end assembly.
Previously, `andps`+`andnps` / `bsl` would be generated (see `@out`).
Now, they would no longer be generated (see `@in`), and we need to make sure that they are generated.

Differential Revision: https://reviews.llvm.org/D46528

llvm-svn: 332904
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
index 0646e62..ee150f1 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
@@ -77,9 +77,8 @@
 ; CHECK-LABEL: in_constant_varx_42:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #42
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
   %n1 = and <4 x i32> %n0, %mask
@@ -107,9 +106,8 @@
 ; CHECK-LABEL: in_constant_varx_42_invmask:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #42
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    bsl v2.16b, v1.16b, v0.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
   %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
@@ -134,8 +132,8 @@
 define <4 x i32> @in_constant_mone_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
 ; CHECK-LABEL: in_constant_mone_vary:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic v0.16b, v2.16b, v1.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    bic v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
   %n1 = and <4 x i32> %n0, %mask
@@ -161,9 +159,8 @@
 define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
 ; CHECK-LABEL: in_constant_mone_vary_invmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v0.16b, v1.16b
-; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    orn v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
   %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
@@ -189,10 +186,9 @@
 define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
 ; CHECK-LABEL: in_constant_42_vary:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.4s, #42
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    movi v2.4s, #42
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
   %n1 = and <4 x i32> %n0, %mask
@@ -219,10 +215,9 @@
 define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
 ; CHECK-LABEL: in_constant_42_vary_invmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.4s, #42
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    movi v2.4s, #42
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
   %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
index c3199cc..df86540 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
@@ -270,9 +270,8 @@
 define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v1i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <1 x i8> %x, %y
   %n1 = and <1 x i8> %n0, %mask
@@ -287,9 +286,8 @@
 define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <2 x i8> %x, %y
   %n1 = and <2 x i8> %n0, %mask
@@ -300,9 +298,8 @@
 define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: in_v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <1 x i16> %x, %y
   %n1 = and <1 x i16> %n0, %mask
@@ -317,9 +314,8 @@
 define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i8> %x, %y
   %n1 = and <4 x i8> %n0, %mask
@@ -330,9 +326,8 @@
 define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
 ; CHECK-LABEL: in_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <2 x i16> %x, %y
   %n1 = and <2 x i16> %n0, %mask
@@ -343,9 +338,8 @@
 define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: in_v1i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <1 x i32> %x, %y
   %n1 = and <1 x i32> %n0, %mask
@@ -360,9 +354,8 @@
 define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <8 x i8> %x, %y
   %n1 = and <8 x i8> %n0, %mask
@@ -373,9 +366,8 @@
 define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-LABEL: in_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i16> %x, %y
   %n1 = and <4 x i16> %n0, %mask
@@ -386,9 +378,8 @@
 define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-LABEL: in_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <2 x i32> %x, %y
   %n1 = and <2 x i32> %n0, %mask
@@ -399,9 +390,8 @@
 define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: in_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <1 x i64> %x, %y
   %n1 = and <1 x i64> %n0, %mask
@@ -416,9 +406,8 @@
 define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <16 x i8> %x, %y
   %n1 = and <16 x i8> %n0, %mask
@@ -429,9 +418,8 @@
 define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
 ; CHECK-LABEL: in_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <8 x i16> %x, %y
   %n1 = and <8 x i16> %n0, %mask
@@ -442,9 +430,8 @@
 define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: in_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i32> %x, %y
   %n1 = and <4 x i32> %n0, %mask
@@ -455,9 +442,8 @@
 define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-LABEL: in_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <2 x i64> %x, %y
   %n1 = and <2 x i64> %n0, %mask
diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll
index 8aaf21c..3b6fcd7 100644
--- a/llvm/test/CodeGen/X86/machine-cp.ll
+++ b/llvm/test/CodeGen/X86/machine-cp.ll
@@ -101,54 +101,64 @@
 define <16 x float> @foo(<16 x float> %x) {
 ; CHECK-LABEL: foo:
 ; CHECK:       ## %bb.0: ## %bb
-; CHECK-NEXT:    xorps %xmm8, %xmm8
-; CHECK-NEXT:    cvttps2dq %xmm3, %xmm9
-; CHECK-NEXT:    movaps %xmm3, %xmm13
-; CHECK-NEXT:    cmpltps %xmm8, %xmm13
-; CHECK-NEXT:    movaps {{.*#+}} xmm7 = [1,1,1,1]
-; CHECK-NEXT:    movaps %xmm13, %xmm3
-; CHECK-NEXT:    andps %xmm7, %xmm3
-; CHECK-NEXT:    cvttps2dq %xmm2, %xmm10
-; CHECK-NEXT:    movaps %xmm2, %xmm5
-; CHECK-NEXT:    cmpltps %xmm8, %xmm5
-; CHECK-NEXT:    movaps %xmm5, %xmm2
-; CHECK-NEXT:    andps %xmm7, %xmm2
-; CHECK-NEXT:    cvttps2dq %xmm1, %xmm11
+; CHECK-NEXT:    movaps %xmm3, %xmm9
+; CHECK-NEXT:    movaps %xmm2, %xmm8
+; CHECK-NEXT:    movaps %xmm1, %xmm6
+; CHECK-NEXT:    movaps %xmm0, %xmm7
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movaps %xmm3, %xmm1
+; CHECK-NEXT:    cmpltps %xmm0, %xmm1
 ; CHECK-NEXT:    movaps %xmm1, %xmm4
-; CHECK-NEXT:    cmpltps %xmm8, %xmm4
-; CHECK-NEXT:    movaps %xmm4, %xmm1
-; CHECK-NEXT:    andps %xmm7, %xmm1
-; CHECK-NEXT:    cvttps2dq %xmm0, %xmm12
-; CHECK-NEXT:    movaps %xmm0, %xmm6
-; CHECK-NEXT:    cmpltps %xmm8, %xmm6
-; CHECK-NEXT:    andps %xmm6, %xmm7
-; CHECK-NEXT:    orps {{.*}}(%rip), %xmm6
-; CHECK-NEXT:    movaps {{.*#+}} xmm14 = [5,6,7,8]
-; CHECK-NEXT:    orps %xmm14, %xmm4
-; CHECK-NEXT:    movaps {{.*#+}} xmm15 = [9,10,11,12]
-; CHECK-NEXT:    orps %xmm15, %xmm5
-; CHECK-NEXT:    movaps {{.*#+}} xmm8 = [13,14,15,16]
-; CHECK-NEXT:    orps %xmm8, %xmm13
-; CHECK-NEXT:    cvtdq2ps %xmm12, %xmm0
-; CHECK-NEXT:    cvtdq2ps %xmm11, %xmm11
-; CHECK-NEXT:    cvtdq2ps %xmm10, %xmm10
-; CHECK-NEXT:    cvtdq2ps %xmm9, %xmm9
-; CHECK-NEXT:    andps %xmm8, %xmm9
-; CHECK-NEXT:    andps %xmm15, %xmm10
-; CHECK-NEXT:    andps %xmm14, %xmm11
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    xorps %xmm7, %xmm0
+; CHECK-NEXT:    orps {{.*}}(%rip), %xmm4
+; CHECK-NEXT:    movaps %xmm4, %xmm10
+; CHECK-NEXT:    andnps %xmm1, %xmm10
+; CHECK-NEXT:    movaps %xmm2, %xmm1
+; CHECK-NEXT:    cmpltps %xmm0, %xmm1
+; CHECK-NEXT:    movaps {{.*#+}} xmm11 = [9,10,11,12]
+; CHECK-NEXT:    movaps %xmm1, %xmm3
+; CHECK-NEXT:    orps %xmm11, %xmm3
+; CHECK-NEXT:    movaps %xmm3, %xmm14
+; CHECK-NEXT:    andnps %xmm1, %xmm14
+; CHECK-NEXT:    cvttps2dq %xmm6, %xmm12
+; CHECK-NEXT:    cmpltps %xmm0, %xmm6
+; CHECK-NEXT:    movaps {{.*#+}} xmm13 = [5,6,7,8]
+; CHECK-NEXT:    movaps %xmm6, %xmm2
+; CHECK-NEXT:    orps %xmm13, %xmm2
+; CHECK-NEXT:    movaps %xmm2, %xmm5
+; CHECK-NEXT:    andnps %xmm6, %xmm5
+; CHECK-NEXT:    cvttps2dq %xmm7, %xmm6
+; CHECK-NEXT:    cmpltps %xmm0, %xmm7
+; CHECK-NEXT:    movaps {{.*#+}} xmm15 = [1,2,3,4]
+; CHECK-NEXT:    movaps %xmm7, %xmm0
+; CHECK-NEXT:    orps %xmm15, %xmm0
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    andnps %xmm7, %xmm1
+; CHECK-NEXT:    andps %xmm15, %xmm0
+; CHECK-NEXT:    cvtdq2ps %xmm6, %xmm6
 ; CHECK-NEXT:    andps %xmm6, %xmm0
-; CHECK-NEXT:    xorps %xmm1, %xmm11
-; CHECK-NEXT:    andps %xmm4, %xmm11
-; CHECK-NEXT:    xorps %xmm2, %xmm10
-; CHECK-NEXT:    andps %xmm5, %xmm10
-; CHECK-NEXT:    xorps %xmm3, %xmm9
-; CHECK-NEXT:    andps %xmm13, %xmm9
-; CHECK-NEXT:    xorps %xmm7, %xmm0
-; CHECK-NEXT:    xorps %xmm11, %xmm1
-; CHECK-NEXT:    xorps %xmm10, %xmm2
-; CHECK-NEXT:    xorps %xmm9, %xmm3
+; CHECK-NEXT:    movaps {{.*#+}} xmm6 = [1,1,1,1]
+; CHECK-NEXT:    andps %xmm6, %xmm1
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    andps %xmm13, %xmm2
+; CHECK-NEXT:    cvtdq2ps %xmm12, %xmm1
+; CHECK-NEXT:    andps %xmm1, %xmm2
+; CHECK-NEXT:    andps %xmm6, %xmm5
+; CHECK-NEXT:    orps %xmm5, %xmm2
+; CHECK-NEXT:    andps %xmm11, %xmm3
+; CHECK-NEXT:    cvttps2dq %xmm8, %xmm1
+; CHECK-NEXT:    cvtdq2ps %xmm1, %xmm1
+; CHECK-NEXT:    andps %xmm1, %xmm3
+; CHECK-NEXT:    andps %xmm6, %xmm14
+; CHECK-NEXT:    orps %xmm14, %xmm3
+; CHECK-NEXT:    andps %xmm6, %xmm10
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm4
+; CHECK-NEXT:    cvttps2dq %xmm9, %xmm1
+; CHECK-NEXT:    cvtdq2ps %xmm1, %xmm1
+; CHECK-NEXT:    andps %xmm1, %xmm4
+; CHECK-NEXT:    orps %xmm10, %xmm4
+; CHECK-NEXT:    movaps %xmm2, %xmm1
+; CHECK-NEXT:    movaps %xmm3, %xmm2
+; CHECK-NEXT:    movaps %xmm4, %xmm3
 ; CHECK-NEXT:    retq
 bb:
   %v3 = icmp slt <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
index 3088e5e..20c3ef8 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
@@ -58,18 +58,20 @@
 ;
 ; CHECK-SSE2-LABEL: in_constant_varx_mone:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pandn (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    pand (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_constant_varx_mone:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
 ; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vpandn (%rdx), %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i32>, <4 x i32> *%px, align 16
   %y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -132,21 +134,22 @@
 ;
 ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm2
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm1
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pandn (%rdi), %xmm1
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_constant_varx_mone_invmask:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
 ; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vpxor (%rdx), %xmm1, %xmm2
-; CHECK-XOP-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpandn (%rdi), %xmm0, %xmm2
+; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i32>, <4 x i32> *%px, align 16
   %y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -198,30 +201,29 @@
 define <4 x i32> @in_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_varx_42:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm0 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT:    andps (%rcx), %xmm1
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_varx_42:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [42,42,42,42]
-; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
+; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_constant_varx_42:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [42,42,42,42]
-; CHECK-XOP-NEXT:    vxorps (%rdi), %xmm0, %xmm1
-; CHECK-XOP-NEXT:    vandps (%rdx), %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT:    vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i32>, <4 x i32> *%px, align 16
   %y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -275,11 +277,10 @@
 ; CHECK-SSE1-LABEL: in_constant_varx_42_invmask:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
-; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
-; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm2
-; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm2
-; CHECK-SSE1-NEXT:    andnps %xmm2, %xmm0
-; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
+; CHECK-SSE1-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
@@ -287,20 +288,17 @@
 ; CHECK-SSE2-LABEL: in_constant_varx_42_invmask:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [42,42,42,42]
-; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm2
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    andnps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rdi), %xmm1
+; CHECK-SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_constant_varx_42_invmask:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps (%rdx), %xmm0
-; CHECK-XOP-NEXT:    vmovaps {{.*#+}} xmm1 = [42,42,42,42]
-; CHECK-XOP-NEXT:    vxorps (%rdi), %xmm1, %xmm2
-; CHECK-XOP-NEXT:    vandnps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rdi), %xmm1, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i32>, <4 x i32> *%px, align 16
   %y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -350,27 +348,27 @@
 define <4 x i32> @in_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_mone_vary:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
-; CHECK-SSE1-NEXT:    andnps (%rcx), %xmm1
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
+; CHECK-SSE1-NEXT:    orps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_mone_vary:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm1
 ; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    andnps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_constant_mone_vary:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
-; CHECK-XOP-NEXT:    vandnps (%rdx), %xmm0, %xmm1
-; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    vmovaps (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vandnps (%rsi), %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i32>, <4 x i32> *%px, align 16
   %y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -425,33 +423,31 @@
 define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
-; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm1
-; CHECK-SSE1-NEXT:    xorps {{.*}}(%rip), %xmm1
-; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm2
-; CHECK-SSE1-NEXT:    andnps %xmm1, %xmm2
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm2
-; CHECK-SSE1-NEXT:    movaps %xmm2, (%rdi)
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa (%rsi), %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    pxor (%rdx), %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    pand (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_constant_mone_vary_invmask:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovdqa (%rsi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
 ; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vpxor (%rdx), %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vpandn %xmm1, %xmm0, %xmm1
-; CHECK-XOP-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vpand (%rsi), %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i32>, <4 x i32> *%px, align 16
   %y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -503,30 +499,29 @@
 define <4 x i32> @in_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_42_vary:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
-; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT:    andps (%rcx), %xmm1
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
+; CHECK-SSE1-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_42_vary:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
-; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42]
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm1
+; CHECK-SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_constant_42_vary:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
-; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-XOP-NEXT:    vandps (%rdx), %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rsi), %xmm1, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i32>, <4 x i32> *%px, align 16
   %y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -579,33 +574,29 @@
 define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_42_vary_invmask:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
-; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm1
-; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm2 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm2
-; CHECK-SSE1-NEXT:    andnps %xmm2, %xmm1
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm1
+; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_42_vary_invmask:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
 ; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm2 = [42,42,42,42]
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    andnps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_constant_42_vary_invmask:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
-; CHECK-XOP-NEXT:    vmovaps (%rdx), %xmm1
-; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-XOP-NEXT:    vandnps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rsi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT:    vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i32>, <4 x i32> *%px, align 16
   %y = load <4 x i32>, <4 x i32> *%py, align 16
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index fac4f4a..f18176c 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -2607,16 +2607,14 @@
 ;
 ; CHECK-SSE2-LABEL: in_v2i8:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v2i8:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %n0 = xor <2 x i8> %x, %y
   %n1 = and <2 x i8> %n0, %mask
@@ -2693,16 +2691,14 @@
 ;
 ; CHECK-SSE2-LABEL: in_v4i8:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v4i8:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %n0 = xor <4 x i8> %x, %y
   %n1 = and <4 x i8> %n0, %mask
@@ -2737,16 +2733,14 @@
 ;
 ; CHECK-SSE2-LABEL: in_v2i16:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v2i16:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %n0 = xor <2 x i16> %x, %y
   %n1 = and <2 x i16> %n0, %mask
@@ -2895,16 +2889,14 @@
 ;
 ; CHECK-SSE2-LABEL: in_v8i8:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v8i8:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %n0 = xor <8 x i8> %x, %y
   %n1 = and <8 x i8> %n0, %mask
@@ -2963,16 +2955,14 @@
 ;
 ; CHECK-SSE2-LABEL: in_v4i16:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v4i16:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %n0 = xor <4 x i16> %x, %y
   %n1 = and <4 x i16> %n0, %mask
@@ -3007,16 +2997,14 @@
 ;
 ; CHECK-SSE2-LABEL: in_v2i32:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v2i32:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %n0 = xor <2 x i32> %x, %y
   %n1 = and <2 x i32> %n0, %mask
@@ -3273,16 +3261,14 @@
 ;
 ; CHECK-SSE2-LABEL: in_v16i8:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v16i8:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %n0 = xor <16 x i8> %x, %y
   %n1 = and <16 x i8> %n0, %mask
@@ -3401,16 +3387,14 @@
 ;
 ; CHECK-SSE2-LABEL: in_v8i16:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v8i16:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %n0 = xor <8 x i16> %x, %y
   %n1 = and <8 x i16> %n0, %mask
@@ -3452,30 +3436,29 @@
 ;
 ; CHECK-SSE1-LABEL: in_v4i32:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
-; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT:    andps (%rcx), %xmm1
-; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
+; CHECK-SSE1-NEXT:    andps (%rsi), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_v4i32:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
-; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm1
+; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v4i32:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
-; CHECK-XOP-NEXT:    vxorps (%rdi), %xmm0, %xmm1
-; CHECK-XOP-NEXT:    vandps (%rdx), %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT:    vpcmov %xmm1, (%rsi), %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i32>, <4 x i32> *%px, align 16
   %y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -3513,16 +3496,14 @@
 ;
 ; CHECK-SSE2-LABEL: in_v2i64:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v2i64:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
   %n0 = xor <2 x i64> %x, %y
   %n1 = and <2 x i64> %n0, %mask
@@ -4067,24 +4048,23 @@
 ;
 ; CHECK-SSE2-LABEL: in_v32i8:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm2
-; CHECK-SSE2-NEXT:    movaps 16(%rsi), %xmm3
-; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm1
-; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    andps 16(%rdx), %xmm1
-; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
+; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
+; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v32i8:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps (%rsi), %ymm0
-; CHECK-XOP-NEXT:    vxorps (%rdi), %ymm0, %ymm1
-; CHECK-XOP-NEXT:    vandps (%rdx), %ymm1, %ymm1
-; CHECK-XOP-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <32 x i8>, <32 x i8> *%px, align 32
   %y = load <32 x i8>, <32 x i8> *%py, align 32
@@ -4402,24 +4382,23 @@
 ;
 ; CHECK-SSE2-LABEL: in_v16i16:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm2
-; CHECK-SSE2-NEXT:    movaps 16(%rsi), %xmm3
-; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm1
-; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    andps 16(%rdx), %xmm1
-; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
+; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
+; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v16i16:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps (%rsi), %ymm0
-; CHECK-XOP-NEXT:    vxorps (%rdi), %ymm0, %ymm1
-; CHECK-XOP-NEXT:    vandps (%rdx), %ymm1, %ymm1
-; CHECK-XOP-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <16 x i16>, <16 x i16> *%px, align 32
   %y = load <16 x i16>, <16 x i16> *%py, align 32
@@ -4571,24 +4550,23 @@
 ;
 ; CHECK-SSE2-LABEL: in_v8i32:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm2
-; CHECK-SSE2-NEXT:    movaps 16(%rsi), %xmm3
-; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm1
-; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    andps 16(%rdx), %xmm1
-; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
+; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
+; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v8i32:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps (%rsi), %ymm0
-; CHECK-XOP-NEXT:    vxorps (%rdi), %ymm0, %ymm1
-; CHECK-XOP-NEXT:    vandps (%rdx), %ymm1, %ymm1
-; CHECK-XOP-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <8 x i32>, <8 x i32> *%px, align 32
   %y = load <8 x i32>, <8 x i32> *%py, align 32
@@ -4664,24 +4642,23 @@
 ;
 ; CHECK-SSE2-LABEL: in_v4i64:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm2
-; CHECK-SSE2-NEXT:    movaps 16(%rsi), %xmm3
-; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm1
-; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    andps 16(%rdx), %xmm1
-; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
+; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
+; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-XOP-LABEL: in_v4i64:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovaps (%rsi), %ymm0
-; CHECK-XOP-NEXT:    vxorps (%rdi), %ymm0, %ymm1
-; CHECK-XOP-NEXT:    vandps (%rdx), %ymm1, %ymm1
-; CHECK-XOP-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
 ; CHECK-XOP-NEXT:    retq
   %x = load <4 x i64>, <4 x i64> *%px, align 32
   %y = load <4 x i64>, <4 x i64> *%py, align 32