[X86][Haswell] Updating HSW instruction scheduling information

This patch completely replaces the instruction scheduling information for the Haswell architecture target by modifying X86SchedHaswell.td under the X86 target.
We used scheduling information obtained from the Haswell architects to replace and correct the existing model.
The patch continues the scheduling replacement effort started with the SNB target in r307529 and r310792.
The information includes the latency, number of micro-ops, and ports used by each HSW instruction.
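For reference, this data is expressed as TableGen write-resource groups in X86SchedHaswell.td; the sketch below is a hypothetical example of that idiom (the group name and instruction regex are illustrative, not entries copied from the patch). In the test diffs that follow, each "sched: [N:M]" comment encodes [latency : reciprocal throughput] derived from this data.

    // Hypothetical Haswell scheduling entry: the result is available after
    // 7 cycles (Latency), the instruction issues as a single micro-op
    // (NumMicroOps), and it occupies port 5 for one cycle (ResourceCycles).
    def HWWriteResGroupExample : SchedWriteRes<[HWPort5]> {
      let Latency = 7;
      let NumMicroOps = 1;
      let ResourceCycles = [1];
    }
    def: InstRW<[HWWriteResGroupExample], (instregex "VAESDECrr")>;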

Please expect some performance fluctuations due to code alignment effects.

Reviewers: RKSimon, zvi, aymanmus, craig.topper, m_zuckerman, igorb, dim, chandlerc, aaboud

Differential Revision: https://reviews.llvm.org/D36663

llvm-svn: 311879
diff --git a/llvm/test/CodeGen/X86/aes-schedule.ll b/llvm/test/CodeGen/X86/aes-schedule.ll
index 372e395..ab86eda 100644
--- a/llvm/test/CodeGen/X86/aes-schedule.ll
+++ b/llvm/test/CodeGen/X86/aes-schedule.ll
@@ -32,7 +32,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaesdec %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
 ; HASWELL-NEXT:    vaesdec (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_aesdec:
 ; BTVER2:       # BB#0:
@@ -75,7 +75,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
 ; HASWELL-NEXT:    vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_aesdeclast:
 ; BTVER2:       # BB#0:
@@ -118,7 +118,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaesenc %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
 ; HASWELL-NEXT:    vaesenc (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_aesenc:
 ; BTVER2:       # BB#0:
@@ -161,7 +161,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaesenclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
 ; HASWELL-NEXT:    vaesenclast (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_aesenclast:
 ; BTVER2:       # BB#0:
@@ -208,7 +208,7 @@
 ; HASWELL-NEXT:    vaesimc %xmm0, %xmm0 # sched: [14:2.00]
 ; HASWELL-NEXT:    vaesimc (%rdi), %xmm1 # sched: [14:2.00]
 ; HASWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_aesimc:
 ; BTVER2:       # BB#0:
@@ -255,10 +255,10 @@
 ;
 ; HASWELL-LABEL: test_aeskeygenassist:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vaeskeygenassist $7, %xmm0, %xmm0 # sched: [10:8.00]
-; HASWELL-NEXT:    vaeskeygenassist $7, (%rdi), %xmm1 # sched: [10:7.00]
+; HASWELL-NEXT:    vaeskeygenassist $7, %xmm0, %xmm0 # sched: [29:7.00]
+; HASWELL-NEXT:    vaeskeygenassist $7, (%rdi), %xmm1 # sched: [28:7.00]
 ; HASWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_aeskeygenassist:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/avx-schedule.ll b/llvm/test/CodeGen/X86/avx-schedule.ll
index 23b30b5..17d2d91 100644
--- a/llvm/test/CodeGen/X86/avx-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx-schedule.ll
@@ -23,8 +23,8 @@
 ; HASWELL-LABEL: test_addpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addpd:
 ; BTVER2:       # BB#0:
@@ -59,8 +59,8 @@
 ; HASWELL-LABEL: test_addps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addps:
 ; BTVER2:       # BB#0:
@@ -95,8 +95,8 @@
 ; HASWELL-LABEL: test_addsubpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addsubpd:
 ; BTVER2:       # BB#0:
@@ -132,8 +132,8 @@
 ; HASWELL-LABEL: test_addsubps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addsubps:
 ; BTVER2:       # BB#0:
@@ -171,9 +171,9 @@
 ; HASWELL-LABEL: test_andnotpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andnotpd:
 ; BTVER2:       # BB#0:
@@ -219,9 +219,9 @@
 ; HASWELL-LABEL: test_andnotps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andnotps:
 ; BTVER2:       # BB#0:
@@ -267,9 +267,9 @@
 ; HASWELL-LABEL: test_andpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andpd:
 ; BTVER2:       # BB#0:
@@ -313,9 +313,9 @@
 ; HASWELL-LABEL: test_andps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andps:
 ; BTVER2:       # BB#0:
@@ -360,8 +360,8 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendpd:
 ; BTVER2:       # BB#0:
@@ -399,8 +399,8 @@
 ; HASWELL-LABEL: test_blendps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33]
-; HASWELL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendps:
 ; BTVER2:       # BB#0:
@@ -435,8 +435,8 @@
 ; HASWELL-LABEL: test_blendvpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendvpd:
 ; BTVER2:       # BB#0:
@@ -472,8 +472,8 @@
 ; HASWELL-LABEL: test_blendvps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendvps:
 ; BTVER2:       # BB#0:
@@ -506,8 +506,8 @@
 ;
 ; HASWELL-LABEL: test_broadcastf128:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastf128:
 ; BTVER2:       # BB#0:
@@ -536,8 +536,8 @@
 ;
 ; HASWELL-LABEL: test_broadcastsd_ymm:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastsd_ymm:
 ; BTVER2:       # BB#0:
@@ -567,8 +567,8 @@
 ;
 ; HASWELL-LABEL: test_broadcastss:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastss:
 ; BTVER2:       # BB#0:
@@ -598,8 +598,8 @@
 ;
 ; HASWELL-LABEL: test_broadcastss_ymm:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastss_ymm:
 ; BTVER2:       # BB#0:
@@ -634,9 +634,9 @@
 ; HASWELL-LABEL: test_cmppd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmppd:
 ; BTVER2:       # BB#0:
@@ -679,9 +679,9 @@
 ; HASWELL-LABEL: test_cmpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmpps:
 ; BTVER2:       # BB#0:
@@ -724,9 +724,9 @@
 ; HASWELL-LABEL: test_cvtdq2pd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00]
-; HASWELL-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [6:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2pd:
 ; BTVER2:       # BB#0:
@@ -767,10 +767,10 @@
 ;
 ; HASWELL-LABEL: test_cvtdq2ps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2ps:
 ; BTVER2:       # BB#0:
@@ -810,9 +810,9 @@
 ; HASWELL-LABEL: test_cvtpd2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00]
-; HASWELL-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [10:1.00]
+; HASWELL-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2dq:
 ; BTVER2:       # BB#0:
@@ -851,10 +851,10 @@
 ;
 ; HASWELL-LABEL: test_cvtpd2ps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2ps:
 ; BTVER2:       # BB#0:
@@ -894,9 +894,9 @@
 ; HASWELL-LABEL: test_cvtps2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
+; HASWELL-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2dq:
 ; BTVER2:       # BB#0:
@@ -933,9 +933,9 @@
 ;
 ; HASWELL-LABEL: test_divpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [27:2.00]
-; HASWELL-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [31:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [35:2.00]
+; HASWELL-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [35:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divpd:
 ; BTVER2:       # BB#0:
@@ -969,9 +969,9 @@
 ;
 ; HASWELL-LABEL: test_divps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divps:
 ; BTVER2:       # BB#0:
@@ -1006,8 +1006,8 @@
 ; HASWELL-LABEL: test_dpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00]
-; HASWELL-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [18:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [14:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_dpps:
 ; BTVER2:       # BB#0:
@@ -1045,9 +1045,9 @@
 ; HASWELL-LABEL: test_extractf128:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [4:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [1:?]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_extractf128:
 ; BTVER2:       # BB#0:
@@ -1083,8 +1083,8 @@
 ; HASWELL-LABEL: test_haddpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2:       # BB#0:
@@ -1120,8 +1120,8 @@
 ; HASWELL-LABEL: test_haddps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2:       # BB#0:
@@ -1157,8 +1157,8 @@
 ; HASWELL-LABEL: test_hsubpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2:       # BB#0:
@@ -1194,8 +1194,8 @@
 ; HASWELL-LABEL: test_hsubps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2:       # BB#0:
@@ -1233,9 +1233,9 @@
 ; HASWELL-LABEL: test_insertf128:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_insertf128:
 ; BTVER2:       # BB#0:
@@ -1272,8 +1272,8 @@
 ;
 ; HASWELL-LABEL: test_lddqu:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vlddqu (%rdi), %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lddqu:
 ; BTVER2:       # BB#0:
@@ -1306,10 +1306,10 @@
 ;
 ; HASWELL-LABEL: test_maskmovpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [2:2.00]
+; HASWELL-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:1.00]
 ; HASWELL-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovpd:
 ; BTVER2:       # BB#0:
@@ -1348,10 +1348,10 @@
 ;
 ; HASWELL-LABEL: test_maskmovpd_ymm:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [2:2.00]
+; HASWELL-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [4:1.00]
 ; HASWELL-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovpd_ymm:
 ; BTVER2:       # BB#0:
@@ -1390,10 +1390,10 @@
 ;
 ; HASWELL-LABEL: test_maskmovps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [2:2.00]
+; HASWELL-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [4:1.00]
 ; HASWELL-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovps:
 ; BTVER2:       # BB#0:
@@ -1432,10 +1432,10 @@
 ;
 ; HASWELL-LABEL: test_maskmovps_ymm:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [2:2.00]
+; HASWELL-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [4:1.00]
 ; HASWELL-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovps_ymm:
 ; BTVER2:       # BB#0:
@@ -1473,8 +1473,8 @@
 ; HASWELL-LABEL: test_maxpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxpd:
 ; BTVER2:       # BB#0:
@@ -1510,8 +1510,8 @@
 ; HASWELL-LABEL: test_maxps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxps:
 ; BTVER2:       # BB#0:
@@ -1547,8 +1547,8 @@
 ; HASWELL-LABEL: test_minpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minpd:
 ; BTVER2:       # BB#0:
@@ -1584,8 +1584,8 @@
 ; HASWELL-LABEL: test_minps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minps:
 ; BTVER2:       # BB#0:
@@ -1622,10 +1622,10 @@
 ;
 ; HASWELL-LABEL: test_movapd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovapd (%rdi), %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movapd:
 ; BTVER2:       # BB#0:
@@ -1663,10 +1663,10 @@
 ;
 ; HASWELL-LABEL: test_movaps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovaps (%rdi), %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movaps:
 ; BTVER2:       # BB#0:
@@ -1705,9 +1705,9 @@
 ; HASWELL-LABEL: test_movddup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; HASWELL-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [1:0.50]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movddup:
 ; BTVER2:       # BB#0:
@@ -1744,9 +1744,9 @@
 ;
 ; HASWELL-LABEL: test_movmskpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [1:?]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovmskpd %ymm0, %eax # sched: [3:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movmskpd:
 ; BTVER2:       # BB#0:
@@ -1778,9 +1778,9 @@
 ;
 ; HASWELL-LABEL: test_movmskps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovmskps %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [1:?]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovmskps %ymm0, %eax # sched: [3:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movmskps:
 ; BTVER2:       # BB#0:
@@ -1814,7 +1814,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntpd:
 ; BTVER2:       # BB#0:
@@ -1849,7 +1849,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntps:
 ; BTVER2:       # BB#0:
@@ -1885,9 +1885,9 @@
 ; HASWELL-LABEL: test_movshdup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; HASWELL-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; HASWELL-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movshdup:
 ; BTVER2:       # BB#0:
@@ -1927,9 +1927,9 @@
 ; HASWELL-LABEL: test_movsldup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; HASWELL-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; HASWELL-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movsldup:
 ; BTVER2:       # BB#0:
@@ -1970,10 +1970,10 @@
 ;
 ; HASWELL-LABEL: test_movupd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovupd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovupd (%rdi), %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovupd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movupd:
 ; BTVER2:       # BB#0:
@@ -2013,10 +2013,10 @@
 ;
 ; HASWELL-LABEL: test_movups:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovups (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovups (%rdi), %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovups %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movups:
 ; BTVER2:       # BB#0:
@@ -2052,9 +2052,9 @@
 ;
 ; HASWELL-LABEL: test_mulpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulpd:
 ; BTVER2:       # BB#0:
@@ -2088,9 +2088,9 @@
 ;
 ; HASWELL-LABEL: test_mulps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulps:
 ; BTVER2:       # BB#0:
@@ -2127,9 +2127,9 @@
 ; HASWELL-LABEL: orpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: orpd:
 ; BTVER2:       # BB#0:
@@ -2173,9 +2173,9 @@
 ; HASWELL-LABEL: test_orps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_orps:
 ; BTVER2:       # BB#0:
@@ -2219,9 +2219,9 @@
 ; HASWELL-LABEL: test_permilpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; HASWELL-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; HASWELL-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilpd:
 ; BTVER2:       # BB#0:
@@ -2261,9 +2261,9 @@
 ; HASWELL-LABEL: test_permilpd_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
-; HASWELL-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
+; HASWELL-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilpd_ymm:
 ; BTVER2:       # BB#0:
@@ -2303,9 +2303,9 @@
 ; HASWELL-LABEL: test_permilps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; HASWELL-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilps:
 ; BTVER2:       # BB#0:
@@ -2345,9 +2345,9 @@
 ; HASWELL-LABEL: test_permilps_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
-; HASWELL-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
+; HASWELL-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilps_ymm:
 ; BTVER2:       # BB#0:
@@ -2385,8 +2385,8 @@
 ; HASWELL-LABEL: test_permilvarpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarpd:
 ; BTVER2:       # BB#0:
@@ -2422,8 +2422,8 @@
 ; HASWELL-LABEL: test_permilvarpd_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarpd_ymm:
 ; BTVER2:       # BB#0:
@@ -2459,8 +2459,8 @@
 ; HASWELL-LABEL: test_permilvarps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarps:
 ; BTVER2:       # BB#0:
@@ -2496,8 +2496,8 @@
 ; HASWELL-LABEL: test_permilvarps_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarps_ymm:
 ; BTVER2:       # BB#0:
@@ -2535,9 +2535,9 @@
 ; HASWELL-LABEL: test_rcpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rcpps:
 ; BTVER2:       # BB#0:
@@ -2577,10 +2577,10 @@
 ;
 ; HASWELL-LABEL: test_roundpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vroundpd $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT:    vroundpd $7, %ymm0, %ymm0 # sched: [5:1.25]
+; HASWELL-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [6:2.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_roundpd:
 ; BTVER2:       # BB#0:
@@ -2620,10 +2620,10 @@
 ;
 ; HASWELL-LABEL: test_roundps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vroundps $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT:    vroundps $7, %ymm0, %ymm0 # sched: [5:1.25]
+; HASWELL-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [6:2.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_roundps:
 ; BTVER2:       # BB#0:
@@ -2664,9 +2664,9 @@
 ; HASWELL-LABEL: test_rsqrtps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [11:2.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rsqrtps:
 ; BTVER2:       # BB#0:
@@ -2707,9 +2707,9 @@
 ; HASWELL-LABEL: test_shufpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; HASWELL-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; HASWELL-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_shufpd:
 ; BTVER2:       # BB#0:
@@ -2747,8 +2747,8 @@
 ; HASWELL-LABEL: test_shufps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; HASWELL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_shufps:
 ; BTVER2:       # BB#0:
@@ -2784,10 +2784,10 @@
 ;
 ; HASWELL-LABEL: test_sqrtpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [32:2.00]
-; HASWELL-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [28:2.00]
+; HASWELL-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [35:2.00]
+; HASWELL-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [35:2.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtpd:
 ; BTVER2:       # BB#0:
@@ -2827,10 +2827,10 @@
 ;
 ; HASWELL-LABEL: test_sqrtps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [23:2.00]
-; HASWELL-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [19:2.00]
+; HASWELL-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [21:2.00]
+; HASWELL-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [21:2.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtps:
 ; BTVER2:       # BB#0:
@@ -2869,8 +2869,8 @@
 ; HASWELL-LABEL: test_subpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subpd:
 ; BTVER2:       # BB#0:
@@ -2905,8 +2905,8 @@
 ; HASWELL-LABEL: test_subps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subps:
 ; BTVER2:       # BB#0:
@@ -2947,11 +2947,11 @@
 ; HASWELL-LABEL: test_testpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    vtestpd %xmm1, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT:    vtestpd %xmm1, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    setb %al # sched: [1:0.50]
-; HASWELL-NEXT:    vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vtestpd (%rdi), %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_testpd:
 ; BTVER2:       # BB#0:
@@ -3002,12 +3002,12 @@
 ; HASWELL-LABEL: test_testpd_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    vtestpd %ymm1, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT:    vtestpd %ymm1, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    setb %al # sched: [1:0.50]
-; HASWELL-NEXT:    vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vtestpd (%rdi), %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    vzeroupper # sched: [1:?]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_testpd_ymm:
 ; BTVER2:       # BB#0:
@@ -3057,11 +3057,11 @@
 ; HASWELL-LABEL: test_testps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    vtestps %xmm1, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT:    vtestps %xmm1, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    setb %al # sched: [1:0.50]
-; HASWELL-NEXT:    vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vtestps (%rdi), %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_testps:
 ; BTVER2:       # BB#0:
@@ -3112,12 +3112,12 @@
 ; HASWELL-LABEL: test_testps_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    vtestps %ymm1, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT:    vtestps %ymm1, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    setb %al # sched: [1:0.50]
-; HASWELL-NEXT:    vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vtestps (%rdi), %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    vzeroupper # sched: [1:?]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_testps_ymm:
 ; BTVER2:       # BB#0:
@@ -3163,9 +3163,9 @@
 ; HASWELL-LABEL: test_unpckhpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; HASWELL-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
+; HASWELL-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhpd:
 ; BTVER2:       # BB#0:
@@ -3203,8 +3203,8 @@
 ; HASWELL-LABEL: test_unpckhps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; HASWELL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhps:
 ; BTVER2:       # BB#0:
@@ -3241,9 +3241,9 @@
 ; HASWELL-LABEL: test_unpcklpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; HASWELL-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; HASWELL-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklpd:
 ; BTVER2:       # BB#0:
@@ -3281,8 +3281,8 @@
 ; HASWELL-LABEL: test_unpcklps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; HASWELL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklps:
 ; BTVER2:       # BB#0:
@@ -3319,9 +3319,9 @@
 ; HASWELL-LABEL: test_xorpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vxorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_xorpd:
 ; BTVER2:       # BB#0:
@@ -3365,9 +3365,9 @@
 ; HASWELL-LABEL: test_xorps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vxorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_xorps:
 ; BTVER2:       # BB#0:
@@ -3406,8 +3406,8 @@
 ;
 ; HASWELL-LABEL: test_zeroall:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vzeroall # sched: [1:?]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vzeroall # sched: [16:16.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_zeroall:
 ; BTVER2:       # BB#0:
@@ -3436,8 +3436,8 @@
 ;
 ; HASWELL-LABEL: test_zeroupper:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vzeroupper # sched: [1:?]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_zeroupper:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/avx2-schedule.ll b/llvm/test/CodeGen/X86/avx2-schedule.ll
index 971ebe5..7dbe2dd 100644
--- a/llvm/test/CodeGen/X86/avx2-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx2-schedule.ll
@@ -15,9 +15,9 @@
 ; HASWELL-LABEL: test_pabsb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpabsb (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpabsb (%rdi), %ymm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pabsb:
 ; ZNVER1:       # BB#0:
@@ -44,9 +44,9 @@
 ; HASWELL-LABEL: test_pabsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpabsd (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpabsd (%rdi), %ymm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pabsd:
 ; ZNVER1:       # BB#0:
@@ -73,9 +73,9 @@
 ; HASWELL-LABEL: test_pabsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpabsw (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpabsw (%rdi), %ymm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pabsw:
 ; ZNVER1:       # BB#0:
@@ -101,8 +101,8 @@
 ; HASWELL-LABEL: test_paddb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_paddb:
 ; ZNVER1:       # BB#0:
@@ -125,8 +125,8 @@
 ; HASWELL-LABEL: test_paddd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_paddd:
 ; ZNVER1:       # BB#0:
@@ -149,8 +149,8 @@
 ; HASWELL-LABEL: test_paddq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_paddq:
 ; ZNVER1:       # BB#0:
@@ -173,8 +173,8 @@
 ; HASWELL-LABEL: test_paddw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_paddw:
 ; ZNVER1:       # BB#0:
@@ -198,9 +198,9 @@
 ; HASWELL-LABEL: test_pand:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pand:
 ; ZNVER1:       # BB#0:
@@ -226,9 +226,9 @@
 ; HASWELL-LABEL: test_pandn:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pandn:
 ; ZNVER1:       # BB#0:
@@ -256,7 +256,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
 ; HASWELL-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pmulld:
 ; ZNVER1:       # BB#0:
@@ -279,8 +279,8 @@
 ; HASWELL-LABEL: test_pmullw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pmullw:
 ; ZNVER1:       # BB#0:
@@ -304,9 +304,9 @@
 ; HASWELL-LABEL: test_por:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_por:
 ; ZNVER1:       # BB#0:
@@ -331,8 +331,8 @@
 ; HASWELL-LABEL: test_psubb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_psubb:
 ; ZNVER1:       # BB#0:
@@ -355,8 +355,8 @@
 ; HASWELL-LABEL: test_psubd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_psubd:
 ; ZNVER1:       # BB#0:
@@ -379,8 +379,8 @@
 ; HASWELL-LABEL: test_psubq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_psubq:
 ; ZNVER1:       # BB#0:
@@ -403,8 +403,8 @@
 ; HASWELL-LABEL: test_psubw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_psubw:
 ; ZNVER1:       # BB#0:
@@ -428,9 +428,9 @@
 ; HASWELL-LABEL: test_pxor:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pxor:
 ; ZNVER1:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll
index b5a1340..d2e95f6 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp.ll
@@ -126,11 +126,11 @@
 define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
 ; ALL-LABEL: test8:
 ; ALL:       ## BB#0:
-; ALL-NEXT:    notl %edi
 ; ALL-NEXT:    xorl $-2147483648, %esi ## imm = 0x80000000
 ; ALL-NEXT:    testl %edx, %edx
 ; ALL-NEXT:    movl $1, %eax
 ; ALL-NEXT:    cmovel %eax, %edx
+; ALL-NEXT:    notl %edi
 ; ALL-NEXT:    orl %edi, %esi
 ; ALL-NEXT:    cmovnel %edx, %eax
 ; ALL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index bc53933..25559dd 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -1530,19 +1530,19 @@
 }
 
 define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
-; NOVL-LABEL: uitofp_2i1_float:
-; NOVL:       # BB#0:
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; NOVL-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
-; NOVL-NEXT:    vpextrb $8, %xmm0, %eax
-; NOVL-NEXT:    andl $1, %eax
-; NOVL-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm1
-; NOVL-NEXT:    vpextrb $0, %xmm0, %eax
-; NOVL-NEXT:    andl $1, %eax
-; NOVL-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
-; NOVL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; NOVL-NEXT:    retq
+; KNL-LABEL: uitofp_2i1_float:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpextrb $8, %xmm0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vpextrb $0, %xmm0, %ecx
+; KNL-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; KNL-NEXT:    andl $1, %ecx
+; KNL-NEXT:    vcvtsi2ssl %ecx, %xmm2, %xmm1
+; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; KNL-NEXT:    retq
 ;
 ; VL-LABEL: uitofp_2i1_float:
 ; VL:       # BB#0:
@@ -1552,6 +1552,34 @@
 ; VL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
 ; VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
 ; VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_2i1_float:
+; AVX512DQ:       # BB#0:
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512DQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm1
+; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; AVX512DQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: uitofp_2i1_float:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm1
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512BW-NEXT:    retq
   %mask = icmp ult <2 x i32> %a, zeroinitializer
   %1 = uitofp <2 x i1> %mask to <2 x float>
   ret <2 x float> %1
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 1a76a9f..50e8484 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -48,8 +48,8 @@
 define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_16x8mem_to_16x16:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; KNL-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -70,8 +70,8 @@
 define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_16x8mem_to_16x16:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; KNL-NEXT:    vpmovsxbw (%rdi), %ymm1
+; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; KNL-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
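
Both avx512-ext.ll hunks are pure reorderings: the vpmovzxbw that expands the i1 mask now issues after the load-extend instead of before it. The test bodies are not part of the hunks; a reconstruction consistent with the CHECK lines (the signature mirrors the visible define, the body is inferred) would be:

define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8>* %i, <16 x i1> %mask) nounwind readnone {
  ; Zero-extending load, masked by sign-extending the i1 lanes
  ; (vpsllw/vpsraw by 15) and ANDing, as the KNL CHECK lines show.
  %a = load <16 x i8>, <16 x i8>* %i
  %x = zext <16 x i8> %a to <16 x i16>
  %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
  ret <16 x i16> %ret
}
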
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index eef49ed..122c118 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -936,7 +936,6 @@
 ; KNL-NEXT:    subq $32, %rsp
 ; KNL-NEXT:    xorl %eax, %eax
 ; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
@@ -1062,6 +1061,7 @@
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
@@ -1112,23 +1112,23 @@
 ; KNL-LABEL: test_iinsertelement_v4i1:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL-NEXT:    vpextrb $4, %xmm0, %ecx
-; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    vpextrb $4, %xmm0, %eax
+; KNL-NEXT:    setb %cl
+; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT:    vpextrb $0, %xmm0, %ecx
-; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    vpextrb $0, %xmm0, %eax
+; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
 ; KNL-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; KNL-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
 ; KNL-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
@@ -1902,14 +1902,23 @@
 }
 
 define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
-; CHECK-LABEL: test_extractelement_variable_v16i8:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    andl $15, %edi
-; CHECK-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movb (%rdi,%rax), %al
-; CHECK-NEXT:    retq
+; KNL-LABEL: test_extractelement_variable_v16i8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v16i8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; SKX-NEXT:    movb (%rdi,%rax), %al
+; SKX-NEXT:    retq
   %t2 = extractelement <16 x i8> %t1, i32 %index
   ret i8 %t2
 }
@@ -1927,8 +1936,8 @@
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $64, %rsp
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $31, %edi
 ; KNL-NEXT:    movq %rsp, %rax
 ; KNL-NEXT:    movb (%rdi,%rax), %al
@@ -1975,9 +1984,9 @@
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-64, %rsp
 ; KNL-NEXT:    subq $128, %rsp
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $63, %edi
 ; KNL-NEXT:    movq %rsp, %rax
 ; KNL-NEXT:    movb (%rdi,%rax), %al
@@ -2066,12 +2075,12 @@
 define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
 ; KNL-LABEL: test_extractelement_varible_v2i1:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    movl -24(%rsp,%rdi,8), %eax
 ; KNL-NEXT:    andl $1, %eax
@@ -2096,12 +2105,12 @@
 define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
 ; KNL-LABEL: test_extractelement_varible_v4i1:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $3, %edi
 ; KNL-NEXT:    movl -24(%rsp,%rdi,4), %eax
 ; KNL-NEXT:    andl $1, %eax
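
In avx512-insert-extract.ll the payload of each test is unchanged; the scheduler mostly sinks the `## kill` register-liveness comment past the vector spill on KNL, which forces the previously shared CHECK prefix to split into KNL and SKX blocks. The variable-index extract pattern is fully visible in the context lines:

define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
  ; Lowered as: spill the vector, mask the index, load one byte back.
  %t2 = extractelement <16 x i8> %t1, i32 %index
  ret i8 %t2
}
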
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 95679dc..afb463d 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -2880,7 +2880,6 @@
 define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
 ; CHECK-LABEL: test_mask_vextractf32x4:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
 ; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k1
 ; CHECK-NEXT:    kshiftrw $15, %k1, %k1
@@ -2898,6 +2897,7 @@
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
 ; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
 ; CHECK-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
@@ -2941,7 +2941,6 @@
 define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
 ; CHECK-LABEL: test_maskz_vextracti32x4:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
 ; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k1
 ; CHECK-NEXT:    kshiftrw $15, %k1, %k1
@@ -2959,6 +2958,7 @@
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
 ; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
 ; CHECK-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
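
Here the vextractf32x4/vextracti32x4 is sunk below the fifteen-instruction mask expansion rather than issued first. The test bodies call the legacy masked-extract intrinsics and are not shown in the hunks; a sketch of the auto-upgraded form they roughly reduce to (function name and exact signature are illustrative, not copied from the test):

define <4 x float> @mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
  ; Extract 128-bit lane 2 (elements 8..11) of the zmm source...
  %sub = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  ; ...then blend with the passthru under the low four mask bits.
  %m = bitcast i8 %mask to <8 x i1>
  %m4 = shufflevector <8 x i1> %m, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = select <4 x i1> %m4, <4 x float> %sub, <4 x float> %b
  ret <4 x float> %res
}
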
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 1ae57c6..f6d752d 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1835,73 +1835,8 @@
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $32, %rsp
-; KNL-NEXT:    vmovups (%rdi), %zmm2
-; KNL-NEXT:    vmovups 64(%rdi), %zmm3
-; KNL-NEXT:    vcmpltps %zmm1, %zmm3, %k1
-; KNL-NEXT:    kshiftlw $14, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    kshiftlw $15, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm3
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $13, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $12, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $11, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $10, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $9, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $8, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $7, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $6, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $5, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $4, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $3, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $2, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $1, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftrw $15, %k1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT:    vcmpltps %zmm0, %zmm2, %k2
+; KNL-NEXT:    vmovups 64(%rdi), %zmm2
+; KNL-NEXT:    vcmpltps %zmm1, %zmm2, %k2
 ; KNL-NEXT:    kshiftlw $14, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1965,138 +1900,203 @@
 ; KNL-NEXT:    kshiftrw $15, %k2, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; KNL-NEXT:    vmovups 4(%rdi), %zmm3 {%k2} {z}
-; KNL-NEXT:    vmovups 68(%rdi), %zmm4 {%k1} {z}
+; KNL-NEXT:    vmovups (%rdi), %zmm3
+; KNL-NEXT:    vcmpltps %zmm0, %zmm3, %k1
+; KNL-NEXT:    kshiftlw $14, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $15, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vmovd %ecx, %xmm3
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $13, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $12, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $11, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $10, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $9, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $8, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $7, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $6, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $5, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $4, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $3, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $2, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $1, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftrw $15, %k1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vmovups 68(%rdi), %zmm4 {%k2} {z}
 ; KNL-NEXT:    vcmpltps %zmm4, %zmm1, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    kshiftlw $15, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kshiftlw $14, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $15, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %ecx
 ; KNL-NEXT:    vmovd %ecx, %xmm4
 ; KNL-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $13, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $13, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $12, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $12, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $11, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $11, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $10, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $10, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $9, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $9, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $8, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $8, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $7, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $7, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $6, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $6, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $5, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $5, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $4, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $4, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $3, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $3, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $2, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $2, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $1, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $1, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
-; KNL-NEXT:    vcmpltps %zmm3, %zmm0, %k0
+; KNL-NEXT:    vmovups 4(%rdi), %zmm5 {%k1} {z}
+; KNL-NEXT:    vcmpltps %zmm5, %zmm0, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm3
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vmovd %ecx, %xmm5
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; KNL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm3
 ; KNL-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; KNL-NEXT:    vpmovsxbd %xmm3, %zmm3
@@ -2941,36 +2941,6 @@
 ;
 ; KNL-LABEL: store_64i1:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:  Lcfi9:
-; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:    pushq %r15
-; KNL-NEXT:  Lcfi10:
-; KNL-NEXT:    .cfi_def_cfa_offset 24
-; KNL-NEXT:    pushq %r14
-; KNL-NEXT:  Lcfi11:
-; KNL-NEXT:    .cfi_def_cfa_offset 32
-; KNL-NEXT:    pushq %r13
-; KNL-NEXT:  Lcfi12:
-; KNL-NEXT:    .cfi_def_cfa_offset 40
-; KNL-NEXT:    pushq %r12
-; KNL-NEXT:  Lcfi13:
-; KNL-NEXT:    .cfi_def_cfa_offset 48
-; KNL-NEXT:    pushq %rbx
-; KNL-NEXT:  Lcfi14:
-; KNL-NEXT:    .cfi_def_cfa_offset 56
-; KNL-NEXT:  Lcfi15:
-; KNL-NEXT:    .cfi_offset %rbx, -56
-; KNL-NEXT:  Lcfi16:
-; KNL-NEXT:    .cfi_offset %r12, -48
-; KNL-NEXT:  Lcfi17:
-; KNL-NEXT:    .cfi_offset %r13, -40
-; KNL-NEXT:  Lcfi18:
-; KNL-NEXT:    .cfi_offset %r14, -32
-; KNL-NEXT:  Lcfi19:
-; KNL-NEXT:    .cfi_offset %r15, -24
-; KNL-NEXT:  Lcfi20:
-; KNL-NEXT:    .cfi_offset %rbp, -16
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -2982,66 +2952,66 @@
 ; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r8d
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r10d
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r11d
+; KNL-NEXT:    vmovd %ecx, %xmm3
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r14d
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
+; KNL-NEXT:    vpinsrb $2, %edx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r15d
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
+; KNL-NEXT:    vpinsrb $3, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r12d
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r13d
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
+; KNL-NEXT:    vpinsrb $5, %edx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebx
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
+; KNL-NEXT:    vpinsrb $6, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebp
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    kshiftlw $3, %k0, %k1
+; KNL-NEXT:    vpinsrb $8, %edx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kshiftlw $3, %k0, %k1
+; KNL-NEXT:    vpinsrb $9, %ecx, %xmm3, %xmm3
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %esi
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
+; KNL-NEXT:    vpinsrb $11, %edx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vmovd %r9d, %xmm3
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k2
+; KNL-NEXT:    vpinsrb $12, %ecx, %xmm3, %xmm2
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $14, %edx, %xmm2, %xmm2
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $1, %r8d, %xmm3, %xmm2
-; KNL-NEXT:    vpinsrb $2, %r10d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $4, %r14d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $5, %r15d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $6, %r12d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $7, %r13d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $8, %ebx, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $9, %ebp, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $12, %edx, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $13, %esi, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $14, %r9d, %xmm2, %xmm2
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
 ; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
@@ -3050,66 +3020,66 @@
 ; KNL-NEXT:    kmovw %k0, 6(%rdi)
 ; KNL-NEXT:    kshiftlw $14, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r8d
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $15, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r10d
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $13, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r9d
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $12, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r11d
+; KNL-NEXT:    vmovd %ecx, %xmm2
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $11, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r14d
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $10, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r15d
+; KNL-NEXT:    vpinsrb $2, %edx, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $9, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r12d
+; KNL-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $8, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r13d
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $7, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vpinsrb $5, %edx, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $6, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %esi
+; KNL-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $5, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ebp
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $4, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ebx
+; KNL-NEXT:    vpinsrb $8, %edx, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $3, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $2, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $1, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %r10d, %xmm2
-; KNL-NEXT:    kmovw %k0, %r10d
+; KNL-NEXT:    vpinsrb $11, %edx, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL-NEXT:    kshiftrw $15, %k2, %k0
-; KNL-NEXT:    vpinsrb $1, %r8d, %xmm2, %xmm1
-; KNL-NEXT:    vpinsrb $2, %r9d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $4, %r14d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $5, %r15d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $6, %r12d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $7, %r13d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $9, %esi, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $10, %ebp, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $11, %ebx, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $14, %r10d, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm1
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -3118,145 +3088,139 @@
 ; KNL-NEXT:    kmovw %k0, 4(%rdi)
 ; KNL-NEXT:    kshiftlw $14, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r8d
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $15, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r10d
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $13, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r9d
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $12, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r11d
+; KNL-NEXT:    vmovd %ecx, %xmm1
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $11, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r14d
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $10, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r15d
+; KNL-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $9, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r12d
+; KNL-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $8, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r13d
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $7, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vpinsrb $5, %edx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $6, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %esi
+; KNL-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $5, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ebp
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $4, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ebx
+; KNL-NEXT:    vpinsrb $8, %edx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $3, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $2, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $1, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %r10d, %xmm1
-; KNL-NEXT:    kmovw %k0, %r10d
+; KNL-NEXT:    vpinsrb $11, %edx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm0
-; KNL-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $14, %r10d, %xmm0, %xmm0
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT:    kmovw %k1, 2(%rdi)
+; KNL-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r8d
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    vpinsrb $14, %edx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r10d
+; KNL-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r11d
-; KNL-NEXT:    kshiftlw $11, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r14d
-; KNL-NEXT:    kshiftlw $10, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r15d
-; KNL-NEXT:    kshiftlw $9, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r12d
-; KNL-NEXT:    kshiftlw $8, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r13d
-; KNL-NEXT:    kshiftlw $7, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    vmovd %edx, %xmm1
 ; KNL-NEXT:    kmovw %k1, %edx
-; KNL-NEXT:    kshiftlw $6, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %esi
-; KNL-NEXT:    kshiftlw $5, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebp
-; KNL-NEXT:    kshiftlw $4, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebx
-; KNL-NEXT:    kshiftlw $3, %k0, %k1
+; KNL-NEXT:    kshiftlw $11, %k0, %k1
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    kshiftlw $2, %k0, %k1
+; KNL-NEXT:    kshiftlw $10, %k0, %k1
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    kshiftlw $1, %k0, %k1
+; KNL-NEXT:    kshiftlw $9, %k0, %k1
+; KNL-NEXT:    vpinsrb $3, %edx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vmovd %r9d, %xmm0
-; KNL-NEXT:    kmovw %k1, %r9d
-; KNL-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $2, %r10d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $8, %edx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kshiftlw $8, %k0, %k1
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $7, %k0, %k1
+; KNL-NEXT:    vpinsrb $5, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kshiftlw $6, %k0, %k1
+; KNL-NEXT:    vpinsrb $6, %edx, %xmm1, %xmm1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kshiftlw $5, %k0, %k1
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $4, %k0, %k1
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kshiftlw $3, %k0, %k1
+; KNL-NEXT:    vpinsrb $9, %edx, %xmm1, %xmm1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kshiftlw $2, %k0, %k1
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $1, %k0, %k1
+; KNL-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $12, %edx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $14, %r9d, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm0
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k0, 2(%rdi)
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
-; KNL-NEXT:    popq %rbx
-; KNL-NEXT:    popq %r12
-; KNL-NEXT:    popq %r13
-; KNL-NEXT:    popq %r14
-; KNL-NEXT:    popq %r15
-; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_64i1:
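
The store_64i1 hunk shows the largest effect in this file: by interleaving the kmovw reads with the vpinsrb chain, the new schedule keeps every temporary in eax/ecx/edx and cuts register pressure enough to drop the whole rbx/r12-r15/rbp save/restore prologue, including its Lcfi labels. The source (reconstructed; only CHECK lines appear above) is just a <64 x i1> store, which KNL splits into four 16-bit kmovw stores:

define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
  store <64 x i1> %v, <64 x i1>* %a
  ret void
}
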
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index b3fbcee..3406fef 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -269,8 +269,6 @@
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
@@ -327,11 +325,13 @@
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -577,75 +577,75 @@
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT:    vpcmpeqw %ymm6, %ymm2, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    vpcmpeqw %ymm6, %ymm2, %ymm1
+; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm0
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vmovd %ecx, %xmm1
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
+; KNL-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
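
As in avx512-mask-op.ll, the avx512-vec-cmp.ll hunks interleave a vptestmd/kmovw mask spill with the vpinsrb build-up instead of completing one sequence before starting the other, renumbering xmm0/xmm1 accordingly. A hypothetical reduction of the kind of test involved (the actual bodies are larger and not shown) is a wide compare that KNL, lacking AVX512BW, must split into two ymm halves:

define <32 x i1> @cmp_32xi16(<32 x i16> %x, <32 x i16> %y) {
  ; On KNL each <16 x i16> half goes through
  ; vpcmpeqw/vpmovsxwd/vpslld/vptestmd, as in the CHECK lines above.
  %c = icmp eq <32 x i16> %x, %y
  ret <32 x i1> %c
}
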
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index c6e1dbd..a788468 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1685,8 +1685,6 @@
 ; AVX512F-32-NEXT:    .cfi_offset %esi, -12
 ; AVX512F-32-NEXT:  .Lcfi9:
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -8
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $5, %al
@@ -1707,39 +1705,39 @@
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    kmovd %ecx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %ebx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $6, %al
@@ -1748,8 +1746,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
@@ -1758,8 +1756,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %al
@@ -1767,8 +1765,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
@@ -1777,8 +1775,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %dl
@@ -1789,8 +1787,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
@@ -1798,8 +1796,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1809,8 +1807,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1820,8 +1818,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1831,8 +1829,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1842,8 +1840,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1852,8 +1850,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -1864,8 +1862,8 @@
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %bl
@@ -1877,8 +1875,8 @@
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %dl
@@ -1887,8 +1885,8 @@
 ; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -1898,8 +1896,8 @@
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -1910,8 +1908,8 @@
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -1921,8 +1919,8 @@
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -1932,19 +1930,390 @@
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $24, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm3, %k0
+; AVX512F-32-NEXT:    movb %al, %dl
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm4, %k0
+; AVX512F-32-NEXT:    movb %al, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movb %dl, %al
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
+; AVX512F-32-NEXT:    shrb $3, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $28, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT:    vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm7, %ymm5, %ymm5
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    movl %ecx, %esi
+; AVX512F-32-NEXT:    shrl $29, %eax
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm7
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %esi, %eax
+; AVX512F-32-NEXT:    shrl $30, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %esi, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    andb $2, %al
+; AVX512F-32-NEXT:    shrb %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movb %dl, %al
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    shrb $3, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    shrb $4, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    shrb $5, %al
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    shrb $6, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    shrb $7, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %ch, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    andb $2, %al
+; AVX512F-32-NEXT:    shrb %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %ch, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movb %dl, %al
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    shrb $3, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    andl $61440, %eax # imm = 0xF000
+; AVX512F-32-NEXT:    shrl $12, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $13, %eax
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    andl $49152, %eax # imm = 0xC000
+; AVX512F-32-NEXT:    shrl $14, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    andl $32768, %eax # imm = 0x8000
+; AVX512F-32-NEXT:    shrl $15, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %ebx
+; AVX512F-32-NEXT:    shrl $16, %ebx
+; AVX512F-32-NEXT:    kmovd %ebx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %bl, %dl
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    movb %bl, %al
+; AVX512F-32-NEXT:    andb $15, %al
+; AVX512F-32-NEXT:    movb %al, %dl
+; AVX512F-32-NEXT:    shrb $2, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k0
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    shrb $3, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %bl, %al
+; AVX512F-32-NEXT:    shrb $4, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %bl, %al
+; AVX512F-32-NEXT:    shrb $5, %al
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %bl, %al
+; AVX512F-32-NEXT:    shrb $6, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
+; AVX512F-32-NEXT:    shrb $7, %bl
+; AVX512F-32-NEXT:    kmovd %ebx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $24, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $2, %dl
@@ -1954,442 +2323,71 @@
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %al, %dl
-; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movb %dl, %al
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    movl %ecx, %esi
-; AVX512F-32-NEXT:    shrl $29, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %esi, %eax
-; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %esi, %eax
-; AVX512F-32-NEXT:    shrl $31, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %dl
-; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movb %dl, %al
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    shrb $7, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %ch, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %ch, %dl
-; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movb %dl, %al
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andl $61440, %eax # imm = 0xF000
-; AVX512F-32-NEXT:    shrl $12, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $13, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andl $49152, %eax # imm = 0xC000
-; AVX512F-32-NEXT:    shrl $14, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andl $32768, %eax # imm = 0x8000
-; AVX512F-32-NEXT:    shrl $15, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %ebx
-; AVX512F-32-NEXT:    shrl $16, %ebx
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %bl, %dl
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    movb %bl, %al
-; AVX512F-32-NEXT:    andb $15, %al
-; AVX512F-32-NEXT:    movb %al, %dl
-; AVX512F-32-NEXT:    shrb $2, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %bl, %al
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %bl, %al
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %bl, %al
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
-; AVX512F-32-NEXT:    shrb $7, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $24, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %al, %dl
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
 ; AVX512F-32-NEXT:    kshiftlq $1, %k0, %k0
@@ -2397,29 +2395,29 @@
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512F-32-NEXT:    korq %k1, %k0, %k1
-; AVX512F-32-NEXT:    vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT:    vpcmpgtb %zmm5, %zmm6, %k2 {%k1}
-; AVX512F-32-NEXT:    vpcmpleb %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT:    vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT:    vpcmpleb %zmm5, %zmm6, %k5 {%k1}
-; AVX512F-32-NEXT:    vpcmpgtb %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k3, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kxorq %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq %k4, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpleb %zmm0, %zmm1, %k2 {%k1}
+; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k5, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kmovq %k1, {{[0-9]+}}(%esp)
@@ -2571,8 +2569,6 @@
 ; AVX512F-32-NEXT:    .cfi_offset %esi, -12
 ; AVX512F-32-NEXT:  .Lcfi15:
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -8
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $5, %al
@@ -2593,39 +2589,39 @@
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    kmovd %ecx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %ebx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $6, %al
@@ -2634,8 +2630,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
@@ -2644,8 +2640,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %al
@@ -2653,8 +2649,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
@@ -2663,8 +2659,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %dl
@@ -2675,8 +2671,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
@@ -2684,8 +2680,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2695,8 +2691,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2706,8 +2702,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2717,8 +2713,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2728,8 +2724,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2738,8 +2734,8 @@
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -2750,8 +2746,8 @@
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %bl
@@ -2763,8 +2759,8 @@
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %dl
@@ -2773,8 +2769,8 @@
 ; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -2784,8 +2780,8 @@
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -2796,8 +2792,8 @@
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -2807,8 +2803,8 @@
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -2818,19 +2814,390 @@
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $24, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm3, %k0
+; AVX512F-32-NEXT:    movb %al, %dl
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm4, %k0
+; AVX512F-32-NEXT:    movb %al, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movb %dl, %al
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
+; AVX512F-32-NEXT:    shrb $3, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $28, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT:    vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm7, %ymm5, %ymm5
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    movl %ecx, %esi
+; AVX512F-32-NEXT:    shrl $29, %eax
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm7
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %esi, %eax
+; AVX512F-32-NEXT:    shrl $30, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %esi, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    andb $2, %al
+; AVX512F-32-NEXT:    shrb %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movb %dl, %al
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    shrb $3, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    shrb $4, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    shrb $5, %al
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    shrb $6, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %cl, %al
+; AVX512F-32-NEXT:    shrb $7, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %ch, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    andb $2, %al
+; AVX512F-32-NEXT:    shrb %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %ch, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movb %dl, %al
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    shrb $3, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    andl $61440, %eax # imm = 0xF000
+; AVX512F-32-NEXT:    shrl $12, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $13, %eax
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    andl $49152, %eax # imm = 0xC000
+; AVX512F-32-NEXT:    shrl $14, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    andl $32768, %eax # imm = 0x8000
+; AVX512F-32-NEXT:    shrl $15, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %ebx
+; AVX512F-32-NEXT:    shrl $16, %ebx
+; AVX512F-32-NEXT:    kmovd %ebx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %bl, %dl
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    movb %bl, %al
+; AVX512F-32-NEXT:    andb $15, %al
+; AVX512F-32-NEXT:    movb %al, %dl
+; AVX512F-32-NEXT:    shrb $2, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k0
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    shrb $3, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %bl, %al
+; AVX512F-32-NEXT:    shrb $4, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %bl, %al
+; AVX512F-32-NEXT:    shrb $5, %al
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movb %bl, %al
+; AVX512F-32-NEXT:    shrb $6, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
+; AVX512F-32-NEXT:    shrb $7, %bl
+; AVX512F-32-NEXT:    kmovd %ebx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $24, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $2, %dl
@@ -2840,442 +3207,71 @@
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %al, %dl
-; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movb %dl, %al
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    movl %ecx, %esi
-; AVX512F-32-NEXT:    shrl $29, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %esi, %eax
-; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %esi, %eax
-; AVX512F-32-NEXT:    shrl $31, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %dl
-; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movb %dl, %al
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %cl, %al
-; AVX512F-32-NEXT:    shrb $7, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %ch, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %ch, %dl
-; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movb %dl, %al
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andl $61440, %eax # imm = 0xF000
-; AVX512F-32-NEXT:    shrl $12, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $13, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andl $49152, %eax # imm = 0xC000
-; AVX512F-32-NEXT:    shrl $14, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andl $32768, %eax # imm = 0x8000
-; AVX512F-32-NEXT:    shrl $15, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %ebx
-; AVX512F-32-NEXT:    shrl $16, %ebx
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %bl, %dl
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    movb %bl, %al
-; AVX512F-32-NEXT:    andb $15, %al
-; AVX512F-32-NEXT:    movb %al, %dl
-; AVX512F-32-NEXT:    shrb $2, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %bl, %al
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %bl, %al
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %bl, %al
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
-; AVX512F-32-NEXT:    shrb $7, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $24, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %al, %dl
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
 ; AVX512F-32-NEXT:    kshiftlq $1, %k0, %k0
@@ -3283,29 +3279,29 @@
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512F-32-NEXT:    korq %k1, %k0, %k1
-; AVX512F-32-NEXT:    vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT:    vpcmpltub %zmm6, %zmm5, %k2 {%k1}
-; AVX512F-32-NEXT:    vpcmpleub %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT:    vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT:    vpcmpnltub %zmm6, %zmm5, %k5 {%k1}
-; AVX512F-32-NEXT:    vpcmpnleub %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k3, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kxorq %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq %k4, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpnltub %zmm1, %zmm0, %k2 {%k1}
+; AVX512F-32-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k5, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kmovq %k1, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index f4504ed..19dd83f 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -2750,23 +2750,23 @@
 ; CHECK-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0]
 ; CHECK-NEXT:    kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
 ; CHECK-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
-; CHECK-NEXT:    kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
-; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
-; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02]
-; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
-; CHECK-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
+; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT:    vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
+; CHECK-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
 ; CHECK-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
 ; CHECK-NEXT:    vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
 ; CHECK-NEXT:    vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT:    vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
 ; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
@@ -2848,23 +2848,23 @@
 ; CHECK-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
 ; CHECK-NEXT:    kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
 ; CHECK-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
-; CHECK-NEXT:    kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
-; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
-; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
-; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
 ; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
-; CHECK-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
+; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
+; CHECK-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
 ; CHECK-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
 ; CHECK-NEXT:    vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
 ; CHECK-NEXT:    vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT:    vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
 ; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index 482a054..fb297a9 100644
--- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -6,7 +6,6 @@
 define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vextractf32x4 $1, %zmm0, %xmm0
 ; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    kshiftlb $7, %k0, %k1
 ; CHECK-NEXT:    kshiftrb $7, %k1, %k1
@@ -16,6 +15,7 @@
 ; CHECK-NEXT:    kmovw %k1, %ecx
 ; CHECK-NEXT:    vmovd %ecx, %xmm2
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; CHECK-NEXT:    vextractf32x4 $1, %zmm0, %xmm0
 ; CHECK-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; CHECK-NEXT:    vpsraq $63, %zmm2, %zmm2
 ; CHECK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll
index 43b1f53..e7d8c88 100644
--- a/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -314,8 +314,8 @@
 ;
 ; NoVLX-LABEL: test256_11:
 ; NoVLX:       # BB#0:
-; NoVLX-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm3
+; NoVLX-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
 ; NoVLX-NEXT:    vpand %ymm2, %ymm3, %ymm2
 ; NoVLX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    retq
@@ -824,8 +824,8 @@
 ;
 ; NoVLX-LABEL: test128_11:
 ; NoVLX:       # BB#0:
-; NoVLX-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm3
+; NoVLX-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
 ; NoVLX-NEXT:    vpand %xmm2, %xmm3, %xmm2
 ; NoVLX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index c8a1682..f0d23e4 100644
--- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -19,23 +19,8 @@
 ; NoVLX-NEXT:    movq %rsp, %rbp
 ; NoVLX-NEXT:  .Lcfi2:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi3:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi4:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi5:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi6:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi7:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -44,64 +29,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -110,12 +95,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -137,30 +117,15 @@
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi8:
+; NoVLX-NEXT:  .Lcfi3:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi9:
+; NoVLX-NEXT:  .Lcfi4:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi10:
+; NoVLX-NEXT:  .Lcfi5:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi11:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi12:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi13:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi14:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi15:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -169,64 +134,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -235,12 +200,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -264,30 +224,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi16:
+; NoVLX-NEXT:  .Lcfi6:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi17:
+; NoVLX-NEXT:  .Lcfi7:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi18:
+; NoVLX-NEXT:  .Lcfi8:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi19:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi20:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi21:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi22:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi23:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -297,64 +242,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -363,12 +308,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -393,30 +333,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi24:
+; NoVLX-NEXT:  .Lcfi9:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi25:
+; NoVLX-NEXT:  .Lcfi10:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi26:
+; NoVLX-NEXT:  .Lcfi11:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi27:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi28:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi29:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi30:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi31:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -426,64 +351,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -492,12 +417,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -523,12 +443,12 @@
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi32:
+; NoVLX-NEXT:  .Lcfi12:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi33:
+; NoVLX-NEXT:  .Lcfi13:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi34:
+; NoVLX-NEXT:  .Lcfi14:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -537,24 +457,20 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi35:
+; NoVLX-NEXT:  .Lcfi15:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi36:
+; NoVLX-NEXT:  .Lcfi16:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi37:
+; NoVLX-NEXT:  .Lcfi17:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi38:
+; NoVLX-NEXT:  .Lcfi18:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi39:
+; NoVLX-NEXT:  .Lcfi19:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -597,11 +513,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -613,11 +529,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -653,12 +573,12 @@
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi40:
+; NoVLX-NEXT:  .Lcfi20:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi41:
+; NoVLX-NEXT:  .Lcfi21:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi42:
+; NoVLX-NEXT:  .Lcfi22:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -667,24 +587,20 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi43:
+; NoVLX-NEXT:  .Lcfi23:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi44:
+; NoVLX-NEXT:  .Lcfi24:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi45:
+; NoVLX-NEXT:  .Lcfi25:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi46:
+; NoVLX-NEXT:  .Lcfi26:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi47:
+; NoVLX-NEXT:  .Lcfi27:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -727,11 +643,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -743,11 +659,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -785,12 +705,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi48:
+; NoVLX-NEXT:  .Lcfi28:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi49:
+; NoVLX-NEXT:  .Lcfi29:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi50:
+; NoVLX-NEXT:  .Lcfi30:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -799,25 +719,21 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi51:
+; NoVLX-NEXT:  .Lcfi31:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi52:
+; NoVLX-NEXT:  .Lcfi32:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi53:
+; NoVLX-NEXT:  .Lcfi33:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi54:
+; NoVLX-NEXT:  .Lcfi34:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi55:
+; NoVLX-NEXT:  .Lcfi35:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -860,11 +776,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -876,11 +792,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -919,12 +839,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi56:
+; NoVLX-NEXT:  .Lcfi36:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi57:
+; NoVLX-NEXT:  .Lcfi37:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi58:
+; NoVLX-NEXT:  .Lcfi38:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -933,25 +853,21 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi59:
+; NoVLX-NEXT:  .Lcfi39:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi60:
+; NoVLX-NEXT:  .Lcfi40:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi61:
+; NoVLX-NEXT:  .Lcfi41:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi62:
+; NoVLX-NEXT:  .Lcfi42:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi63:
+; NoVLX-NEXT:  .Lcfi43:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -994,11 +910,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -1010,11 +926,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1055,12 +975,12 @@
 ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi64:
+; NoVLX-NEXT:  .Lcfi44:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi65:
+; NoVLX-NEXT:  .Lcfi45:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi66:
+; NoVLX-NEXT:  .Lcfi46:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1104,12 +1024,12 @@
 ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi67:
+; NoVLX-NEXT:  .Lcfi47:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi68:
+; NoVLX-NEXT:  .Lcfi48:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi69:
+; NoVLX-NEXT:  .Lcfi49:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1155,12 +1075,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi70:
+; NoVLX-NEXT:  .Lcfi50:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi71:
+; NoVLX-NEXT:  .Lcfi51:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi72:
+; NoVLX-NEXT:  .Lcfi52:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -1216,12 +1136,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi73:
+; NoVLX-NEXT:  .Lcfi53:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi74:
+; NoVLX-NEXT:  .Lcfi54:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi75:
+; NoVLX-NEXT:  .Lcfi55:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -1400,12 +1320,12 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi76:
+; NoVLX-NEXT:  .Lcfi56:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi77:
+; NoVLX-NEXT:  .Lcfi57:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi78:
+; NoVLX-NEXT:  .Lcfi58:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -1475,12 +1395,12 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi79:
+; NoVLX-NEXT:  .Lcfi59:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi80:
+; NoVLX-NEXT:  .Lcfi60:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi81:
+; NoVLX-NEXT:  .Lcfi61:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -1552,12 +1472,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi82:
+; NoVLX-NEXT:  .Lcfi62:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi83:
+; NoVLX-NEXT:  .Lcfi63:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi84:
+; NoVLX-NEXT:  .Lcfi64:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -1631,12 +1551,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi85:
+; NoVLX-NEXT:  .Lcfi65:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi86:
+; NoVLX-NEXT:  .Lcfi66:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi87:
+; NoVLX-NEXT:  .Lcfi67:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -1711,12 +1631,12 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi88:
+; NoVLX-NEXT:  .Lcfi68:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi89:
+; NoVLX-NEXT:  .Lcfi69:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi90:
+; NoVLX-NEXT:  .Lcfi70:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1724,43 +1644,43 @@
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1791,12 +1711,12 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi91:
+; NoVLX-NEXT:  .Lcfi71:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi92:
+; NoVLX-NEXT:  .Lcfi72:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi93:
+; NoVLX-NEXT:  .Lcfi73:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1804,43 +1724,43 @@
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1873,12 +1793,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi94:
+; NoVLX-NEXT:  .Lcfi74:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi95:
+; NoVLX-NEXT:  .Lcfi75:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi96:
+; NoVLX-NEXT:  .Lcfi76:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1887,43 +1807,43 @@
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1957,12 +1877,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi97:
+; NoVLX-NEXT:  .Lcfi77:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi98:
+; NoVLX-NEXT:  .Lcfi78:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi99:
+; NoVLX-NEXT:  .Lcfi79:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1971,43 +1891,43 @@
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2043,30 +1963,15 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi100:
+; NoVLX-NEXT:  .Lcfi80:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi101:
+; NoVLX-NEXT:  .Lcfi81:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi102:
+; NoVLX-NEXT:  .Lcfi82:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi103:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi104:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi105:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi106:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi107:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -2075,64 +1980,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2141,12 +2046,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -2169,30 +2069,15 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi108:
+; NoVLX-NEXT:  .Lcfi83:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi109:
+; NoVLX-NEXT:  .Lcfi84:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi110:
+; NoVLX-NEXT:  .Lcfi85:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi111:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi112:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi113:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi114:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi115:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -2201,64 +2086,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2267,12 +2152,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -2297,30 +2177,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi116:
+; NoVLX-NEXT:  .Lcfi86:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi117:
+; NoVLX-NEXT:  .Lcfi87:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi118:
+; NoVLX-NEXT:  .Lcfi88:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi119:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi120:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi121:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi122:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi123:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -2330,64 +2195,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2396,12 +2261,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -2427,30 +2287,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi124:
+; NoVLX-NEXT:  .Lcfi89:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi125:
+; NoVLX-NEXT:  .Lcfi90:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi126:
+; NoVLX-NEXT:  .Lcfi91:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi127:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi128:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi129:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi130:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi131:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -2460,64 +2305,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2526,12 +2371,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -2558,12 +2398,12 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi132:
+; NoVLX-NEXT:  .Lcfi92:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi133:
+; NoVLX-NEXT:  .Lcfi93:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi134:
+; NoVLX-NEXT:  .Lcfi94:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -2572,24 +2412,20 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi135:
+; NoVLX-NEXT:  .Lcfi95:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi136:
+; NoVLX-NEXT:  .Lcfi96:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi137:
+; NoVLX-NEXT:  .Lcfi97:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi138:
+; NoVLX-NEXT:  .Lcfi98:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi139:
+; NoVLX-NEXT:  .Lcfi99:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -2632,11 +2468,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -2648,11 +2484,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2689,12 +2529,12 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi140:
+; NoVLX-NEXT:  .Lcfi100:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi141:
+; NoVLX-NEXT:  .Lcfi101:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi142:
+; NoVLX-NEXT:  .Lcfi102:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -2703,24 +2543,20 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi143:
+; NoVLX-NEXT:  .Lcfi103:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi144:
+; NoVLX-NEXT:  .Lcfi104:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi145:
+; NoVLX-NEXT:  .Lcfi105:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi146:
+; NoVLX-NEXT:  .Lcfi106:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi147:
+; NoVLX-NEXT:  .Lcfi107:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -2763,11 +2599,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -2779,11 +2615,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2822,12 +2662,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi148:
+; NoVLX-NEXT:  .Lcfi108:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi149:
+; NoVLX-NEXT:  .Lcfi109:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi150:
+; NoVLX-NEXT:  .Lcfi110:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -2836,25 +2676,21 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi151:
+; NoVLX-NEXT:  .Lcfi111:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi152:
+; NoVLX-NEXT:  .Lcfi112:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi153:
+; NoVLX-NEXT:  .Lcfi113:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi154:
+; NoVLX-NEXT:  .Lcfi114:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi155:
+; NoVLX-NEXT:  .Lcfi115:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -2897,11 +2733,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -2913,11 +2749,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2957,12 +2797,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi156:
+; NoVLX-NEXT:  .Lcfi116:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi157:
+; NoVLX-NEXT:  .Lcfi117:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi158:
+; NoVLX-NEXT:  .Lcfi118:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -2971,25 +2811,21 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi159:
+; NoVLX-NEXT:  .Lcfi119:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi160:
+; NoVLX-NEXT:  .Lcfi120:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi161:
+; NoVLX-NEXT:  .Lcfi121:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi162:
+; NoVLX-NEXT:  .Lcfi122:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi163:
+; NoVLX-NEXT:  .Lcfi123:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -3032,11 +2868,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -3048,11 +2884,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -3093,32 +2933,47 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi164:
+; NoVLX-NEXT:  .Lcfi124:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi165:
+; NoVLX-NEXT:  .Lcfi125:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi166:
+; NoVLX-NEXT:  .Lcfi126:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -3126,7 +2981,8 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -3136,6 +2992,27 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
@@ -3146,97 +3023,83 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
 ; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm7
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm3, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm8, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -3244,102 +3107,79 @@
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
@@ -3444,29 +3284,47 @@
 ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi167:
+; NoVLX-NEXT:  .Lcfi127:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi168:
+; NoVLX-NEXT:  .Lcfi128:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi169:
+; NoVLX-NEXT:  .Lcfi129:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -3474,39 +3332,20 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -3514,7 +3353,8 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -3524,19 +3364,19 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm0
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
 ; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpeqw 32(%rdi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
@@ -3712,12 +3552,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi170:
+; NoVLX-NEXT:  .Lcfi130:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi171:
+; NoVLX-NEXT:  .Lcfi131:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi172:
+; NoVLX-NEXT:  .Lcfi132:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -3728,17 +3568,12 @@
 ; NoVLX-NEXT:    movq %rax, %rdx
 ; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -3746,9 +3581,10 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm4
@@ -3766,87 +3602,8 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -3856,9 +3613,50 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm5
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm6, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
@@ -3866,29 +3664,66 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm6
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm6
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm3
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm4
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -3897,9 +3732,76 @@
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm4, %ymm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpcmpeqw %ymm4, %ymm1, %ymm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm1
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
 ; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
@@ -3958,86 +3860,24 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -4075,12 +3915,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi173:
+; NoVLX-NEXT:  .Lcfi133:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi174:
+; NoVLX-NEXT:  .Lcfi134:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi175:
+; NoVLX-NEXT:  .Lcfi135:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -4092,8 +3932,6 @@
 ; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
@@ -4106,19 +3944,20 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -4126,6 +3965,7 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
@@ -4148,112 +3988,40 @@
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm2, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %eax, %xmm2
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpcmpeqw 32(%rsi), %ymm3, %ymm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm4
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm3, %ymm3
 ; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
 ; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -4307,15 +4075,87 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm2
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm1
+; NoVLX-NEXT:    vpcmpeqw 32(%rsi), %ymm2, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -4370,8 +4210,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4425,8 +4265,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4465,7 +4305,6 @@
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -4477,13 +4316,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -4500,8 +4340,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4577,8 +4417,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4638,8 +4478,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4680,7 +4520,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -4692,13 +4531,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -4715,8 +4555,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4775,8 +4615,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -4829,8 +4669,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -4869,7 +4709,6 @@
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -4881,13 +4720,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -4903,8 +4743,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -4979,8 +4819,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -5039,8 +4879,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -5081,7 +4921,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -5093,13 +4932,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -5115,8 +4955,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -5159,12 +4999,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi176:
+; NoVLX-NEXT:  .Lcfi136:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi177:
+; NoVLX-NEXT:  .Lcfi137:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi178:
+; NoVLX-NEXT:  .Lcfi138:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -5202,12 +5042,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi179:
+; NoVLX-NEXT:  .Lcfi139:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi180:
+; NoVLX-NEXT:  .Lcfi140:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi181:
+; NoVLX-NEXT:  .Lcfi141:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -5247,16 +5087,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi182:
+; NoVLX-NEXT:  .Lcfi142:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi183:
+; NoVLX-NEXT:  .Lcfi143:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi184:
+; NoVLX-NEXT:  .Lcfi144:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -5268,13 +5107,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -5312,12 +5152,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi185:
+; NoVLX-NEXT:  .Lcfi145:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi186:
+; NoVLX-NEXT:  .Lcfi146:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi187:
+; NoVLX-NEXT:  .Lcfi147:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -5378,12 +5218,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi188:
+; NoVLX-NEXT:  .Lcfi148:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi189:
+; NoVLX-NEXT:  .Lcfi149:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi190:
+; NoVLX-NEXT:  .Lcfi150:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -5425,17 +5265,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi191:
+; NoVLX-NEXT:  .Lcfi151:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi192:
+; NoVLX-NEXT:  .Lcfi152:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi193:
+; NoVLX-NEXT:  .Lcfi153:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -5447,13 +5286,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -5493,20 +5333,20 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi194:
+; NoVLX-NEXT:  .Lcfi154:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi195:
+; NoVLX-NEXT:  .Lcfi155:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi196:
+; NoVLX-NEXT:  .Lcfi156:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5542,20 +5382,20 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi197:
+; NoVLX-NEXT:  .Lcfi157:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi198:
+; NoVLX-NEXT:  .Lcfi158:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi199:
+; NoVLX-NEXT:  .Lcfi159:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5593,19 +5433,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi200:
+; NoVLX-NEXT:  .Lcfi160:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi201:
+; NoVLX-NEXT:  .Lcfi161:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi202:
+; NoVLX-NEXT:  .Lcfi162:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -5618,13 +5457,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5664,19 +5504,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi203:
+; NoVLX-NEXT:  .Lcfi163:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi204:
+; NoVLX-NEXT:  .Lcfi164:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi205:
+; NoVLX-NEXT:  .Lcfi165:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -5689,13 +5528,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5736,12 +5576,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi206:
+; NoVLX-NEXT:  .Lcfi166:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi207:
+; NoVLX-NEXT:  .Lcfi167:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi208:
+; NoVLX-NEXT:  .Lcfi168:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -5749,8 +5589,8 @@
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5789,12 +5629,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi209:
+; NoVLX-NEXT:  .Lcfi169:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi210:
+; NoVLX-NEXT:  .Lcfi170:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi211:
+; NoVLX-NEXT:  .Lcfi171:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -5802,7 +5642,6 @@
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -5815,13 +5654,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -6052,12 +5892,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi212:
+; NoVLX-NEXT:  .Lcfi172:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi213:
+; NoVLX-NEXT:  .Lcfi173:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi214:
+; NoVLX-NEXT:  .Lcfi174:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6127,12 +5967,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi215:
+; NoVLX-NEXT:  .Lcfi175:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi216:
+; NoVLX-NEXT:  .Lcfi176:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi217:
+; NoVLX-NEXT:  .Lcfi177:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6204,12 +6044,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi218:
+; NoVLX-NEXT:  .Lcfi178:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi219:
+; NoVLX-NEXT:  .Lcfi179:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi220:
+; NoVLX-NEXT:  .Lcfi180:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6284,12 +6124,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi221:
+; NoVLX-NEXT:  .Lcfi181:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi222:
+; NoVLX-NEXT:  .Lcfi182:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi223:
+; NoVLX-NEXT:  .Lcfi183:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6365,12 +6205,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi224:
+; NoVLX-NEXT:  .Lcfi184:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi225:
+; NoVLX-NEXT:  .Lcfi185:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi226:
+; NoVLX-NEXT:  .Lcfi186:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6443,12 +6283,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi227:
+; NoVLX-NEXT:  .Lcfi187:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi228:
+; NoVLX-NEXT:  .Lcfi188:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi229:
+; NoVLX-NEXT:  .Lcfi189:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6525,55 +6365,55 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi230:
+; NoVLX-NEXT:  .Lcfi190:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi231:
+; NoVLX-NEXT:  .Lcfi191:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi232:
+; NoVLX-NEXT:  .Lcfi192:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6605,55 +6445,55 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi233:
+; NoVLX-NEXT:  .Lcfi193:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi234:
+; NoVLX-NEXT:  .Lcfi194:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi235:
+; NoVLX-NEXT:  .Lcfi195:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6687,12 +6527,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi236:
+; NoVLX-NEXT:  .Lcfi196:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi237:
+; NoVLX-NEXT:  .Lcfi197:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi238:
+; NoVLX-NEXT:  .Lcfi198:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -6701,43 +6541,43 @@
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6772,12 +6612,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi239:
+; NoVLX-NEXT:  .Lcfi199:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi240:
+; NoVLX-NEXT:  .Lcfi200:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi241:
+; NoVLX-NEXT:  .Lcfi201:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -6786,43 +6626,43 @@
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6858,55 +6698,55 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi242:
+; NoVLX-NEXT:  .Lcfi202:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi243:
+; NoVLX-NEXT:  .Lcfi203:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi244:
+; NoVLX-NEXT:  .Lcfi204:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6941,12 +6781,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi245:
+; NoVLX-NEXT:  .Lcfi205:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi246:
+; NoVLX-NEXT:  .Lcfi206:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi247:
+; NoVLX-NEXT:  .Lcfi207:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -6955,43 +6795,43 @@
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k0, %k1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -7028,93 +6868,78 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi248:
+; NoVLX-NEXT:  .Lcfi208:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi249:
+; NoVLX-NEXT:  .Lcfi209:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi250:
+; NoVLX-NEXT:  .Lcfi210:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi251:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi252:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi253:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi254:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi255:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7123,12 +6948,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7151,93 +6971,78 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi256:
+; NoVLX-NEXT:  .Lcfi211:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi257:
+; NoVLX-NEXT:  .Lcfi212:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi258:
+; NoVLX-NEXT:  .Lcfi213:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi259:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi260:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi261:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi262:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi263:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7246,12 +7051,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7276,94 +7076,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi264:
+; NoVLX-NEXT:  .Lcfi214:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi265:
+; NoVLX-NEXT:  .Lcfi215:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi266:
+; NoVLX-NEXT:  .Lcfi216:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi267:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi268:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi269:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi270:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi271:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7372,12 +7157,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7403,94 +7183,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi272:
+; NoVLX-NEXT:  .Lcfi217:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi273:
+; NoVLX-NEXT:  .Lcfi218:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi274:
+; NoVLX-NEXT:  .Lcfi219:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi275:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi276:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi277:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi278:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi279:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7499,12 +7264,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7531,93 +7291,78 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi280:
+; NoVLX-NEXT:  .Lcfi220:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi281:
+; NoVLX-NEXT:  .Lcfi221:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi282:
+; NoVLX-NEXT:  .Lcfi222:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi283:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi284:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi285:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi286:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi287:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7626,12 +7371,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7657,94 +7397,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi288:
+; NoVLX-NEXT:  .Lcfi223:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi289:
+; NoVLX-NEXT:  .Lcfi224:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi290:
+; NoVLX-NEXT:  .Lcfi225:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi291:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi292:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi293:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi294:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi295:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7753,12 +7478,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7786,12 +7506,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi296:
+; NoVLX-NEXT:  .Lcfi226:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi297:
+; NoVLX-NEXT:  .Lcfi227:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi298:
+; NoVLX-NEXT:  .Lcfi228:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -7800,21 +7520,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi299:
+; NoVLX-NEXT:  .Lcfi229:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi300:
+; NoVLX-NEXT:  .Lcfi230:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi301:
+; NoVLX-NEXT:  .Lcfi231:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi302:
+; NoVLX-NEXT:  .Lcfi232:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi303:
+; NoVLX-NEXT:  .Lcfi233:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -7857,11 +7573,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -7873,11 +7589,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -7914,12 +7634,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi304:
+; NoVLX-NEXT:  .Lcfi234:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi305:
+; NoVLX-NEXT:  .Lcfi235:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi306:
+; NoVLX-NEXT:  .Lcfi236:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -7928,21 +7648,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi307:
+; NoVLX-NEXT:  .Lcfi237:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi308:
+; NoVLX-NEXT:  .Lcfi238:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi309:
+; NoVLX-NEXT:  .Lcfi239:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi310:
+; NoVLX-NEXT:  .Lcfi240:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi311:
+; NoVLX-NEXT:  .Lcfi241:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -7985,11 +7701,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8001,11 +7717,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8044,12 +7764,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi312:
+; NoVLX-NEXT:  .Lcfi242:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi313:
+; NoVLX-NEXT:  .Lcfi243:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi314:
+; NoVLX-NEXT:  .Lcfi244:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -8058,22 +7778,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi315:
+; NoVLX-NEXT:  .Lcfi245:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi316:
+; NoVLX-NEXT:  .Lcfi246:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi317:
+; NoVLX-NEXT:  .Lcfi247:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi318:
+; NoVLX-NEXT:  .Lcfi248:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi319:
+; NoVLX-NEXT:  .Lcfi249:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -8116,11 +7832,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8132,11 +7848,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8176,12 +7896,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi320:
+; NoVLX-NEXT:  .Lcfi250:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi321:
+; NoVLX-NEXT:  .Lcfi251:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi322:
+; NoVLX-NEXT:  .Lcfi252:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -8190,22 +7910,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi323:
+; NoVLX-NEXT:  .Lcfi253:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi324:
+; NoVLX-NEXT:  .Lcfi254:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi325:
+; NoVLX-NEXT:  .Lcfi255:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi326:
+; NoVLX-NEXT:  .Lcfi256:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi327:
+; NoVLX-NEXT:  .Lcfi257:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -8248,11 +7964,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8264,11 +7980,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8309,12 +8029,12 @@
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi328:
+; NoVLX-NEXT:  .Lcfi258:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi329:
+; NoVLX-NEXT:  .Lcfi259:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi330:
+; NoVLX-NEXT:  .Lcfi260:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -8323,21 +8043,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi331:
+; NoVLX-NEXT:  .Lcfi261:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi332:
+; NoVLX-NEXT:  .Lcfi262:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi333:
+; NoVLX-NEXT:  .Lcfi263:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi334:
+; NoVLX-NEXT:  .Lcfi264:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi335:
+; NoVLX-NEXT:  .Lcfi265:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -8380,11 +8096,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8396,11 +8112,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8440,12 +8160,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi336:
+; NoVLX-NEXT:  .Lcfi266:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi337:
+; NoVLX-NEXT:  .Lcfi267:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi338:
+; NoVLX-NEXT:  .Lcfi268:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -8454,22 +8174,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi339:
+; NoVLX-NEXT:  .Lcfi269:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi340:
+; NoVLX-NEXT:  .Lcfi270:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi341:
+; NoVLX-NEXT:  .Lcfi271:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi342:
+; NoVLX-NEXT:  .Lcfi272:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi343:
+; NoVLX-NEXT:  .Lcfi273:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -8512,11 +8228,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8528,11 +8244,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8629,7 +8349,6 @@
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -8637,9 +8356,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8744,7 +8464,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -8752,9 +8471,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -9353,12 +9073,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi344:
+; NoVLX-NEXT:  .Lcfi274:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi345:
+; NoVLX-NEXT:  .Lcfi275:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi346:
+; NoVLX-NEXT:  .Lcfi276:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -9396,12 +9116,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi347:
+; NoVLX-NEXT:  .Lcfi277:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi348:
+; NoVLX-NEXT:  .Lcfi278:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi349:
+; NoVLX-NEXT:  .Lcfi279:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -9441,16 +9161,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi350:
+; NoVLX-NEXT:  .Lcfi280:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi351:
+; NoVLX-NEXT:  .Lcfi281:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi352:
+; NoVLX-NEXT:  .Lcfi282:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -9458,9 +9177,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -9498,12 +9218,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi353:
+; NoVLX-NEXT:  .Lcfi283:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi354:
+; NoVLX-NEXT:  .Lcfi284:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi355:
+; NoVLX-NEXT:  .Lcfi285:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -9556,12 +9276,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi356:
+; NoVLX-NEXT:  .Lcfi286:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi357:
+; NoVLX-NEXT:  .Lcfi287:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi358:
+; NoVLX-NEXT:  .Lcfi288:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -9603,17 +9323,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi359:
+; NoVLX-NEXT:  .Lcfi289:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi360:
+; NoVLX-NEXT:  .Lcfi290:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi361:
+; NoVLX-NEXT:  .Lcfi291:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -9621,9 +9340,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -9663,20 +9383,20 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi362:
+; NoVLX-NEXT:  .Lcfi292:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi363:
+; NoVLX-NEXT:  .Lcfi293:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi364:
+; NoVLX-NEXT:  .Lcfi294:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9712,20 +9432,20 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi365:
+; NoVLX-NEXT:  .Lcfi295:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi366:
+; NoVLX-NEXT:  .Lcfi296:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi367:
+; NoVLX-NEXT:  .Lcfi297:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9763,12 +9483,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi368:
+; NoVLX-NEXT:  .Lcfi298:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi369:
+; NoVLX-NEXT:  .Lcfi299:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi370:
+; NoVLX-NEXT:  .Lcfi300:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -9785,8 +9505,8 @@
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9826,12 +9546,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi371:
+; NoVLX-NEXT:  .Lcfi301:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi372:
+; NoVLX-NEXT:  .Lcfi302:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi373:
+; NoVLX-NEXT:  .Lcfi303:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -9848,8 +9568,8 @@
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9890,12 +9610,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi374:
+; NoVLX-NEXT:  .Lcfi304:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi375:
+; NoVLX-NEXT:  .Lcfi305:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi376:
+; NoVLX-NEXT:  .Lcfi306:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -9903,8 +9623,8 @@
 ; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9943,12 +9663,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi377:
+; NoVLX-NEXT:  .Lcfi307:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi378:
+; NoVLX-NEXT:  .Lcfi308:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi379:
+; NoVLX-NEXT:  .Lcfi309:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -9966,8 +9686,8 @@
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -10028,8 +9748,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10085,8 +9805,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10127,7 +9847,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10145,6 +9864,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10162,8 +9882,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10206,7 +9926,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10224,6 +9943,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10241,8 +9961,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10304,8 +10024,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10348,7 +10068,6 @@
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10366,6 +10085,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10383,8 +10103,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10445,8 +10165,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10501,8 +10221,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10543,7 +10263,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10561,6 +10280,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10577,8 +10297,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10621,7 +10341,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10639,6 +10358,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10655,8 +10375,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10717,8 +10437,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10761,7 +10481,6 @@
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10779,6 +10498,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10795,8 +10515,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10840,12 +10560,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi380:
+; NoVLX-NEXT:  .Lcfi310:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi381:
+; NoVLX-NEXT:  .Lcfi311:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi382:
+; NoVLX-NEXT:  .Lcfi312:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -10885,12 +10605,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi383:
+; NoVLX-NEXT:  .Lcfi313:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi384:
+; NoVLX-NEXT:  .Lcfi314:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi385:
+; NoVLX-NEXT:  .Lcfi315:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -10932,17 +10652,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi386:
+; NoVLX-NEXT:  .Lcfi316:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi387:
+; NoVLX-NEXT:  .Lcfi317:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi388:
+; NoVLX-NEXT:  .Lcfi318:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10960,6 +10679,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -10999,17 +10719,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi389:
+; NoVLX-NEXT:  .Lcfi319:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi390:
+; NoVLX-NEXT:  .Lcfi320:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi391:
+; NoVLX-NEXT:  .Lcfi321:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -11027,6 +10746,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -11067,12 +10787,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi392:
+; NoVLX-NEXT:  .Lcfi322:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi393:
+; NoVLX-NEXT:  .Lcfi323:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi394:
+; NoVLX-NEXT:  .Lcfi324:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -11116,18 +10836,17 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi395:
+; NoVLX-NEXT:  .Lcfi325:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi396:
+; NoVLX-NEXT:  .Lcfi326:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi397:
+; NoVLX-NEXT:  .Lcfi327:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -11145,6 +10864,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -11186,12 +10906,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi398:
+; NoVLX-NEXT:  .Lcfi328:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi399:
+; NoVLX-NEXT:  .Lcfi329:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi400:
+; NoVLX-NEXT:  .Lcfi330:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11199,8 +10919,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11237,12 +10957,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi401:
+; NoVLX-NEXT:  .Lcfi331:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi402:
+; NoVLX-NEXT:  .Lcfi332:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi403:
+; NoVLX-NEXT:  .Lcfi333:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11250,8 +10970,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11290,12 +11010,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi404:
+; NoVLX-NEXT:  .Lcfi334:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi405:
+; NoVLX-NEXT:  .Lcfi335:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi406:
+; NoVLX-NEXT:  .Lcfi336:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11303,7 +11023,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -11316,13 +11035,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11363,12 +11083,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi407:
+; NoVLX-NEXT:  .Lcfi337:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi408:
+; NoVLX-NEXT:  .Lcfi338:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi409:
+; NoVLX-NEXT:  .Lcfi339:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11376,7 +11096,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -11389,13 +11108,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11437,12 +11157,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi410:
+; NoVLX-NEXT:  .Lcfi340:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi411:
+; NoVLX-NEXT:  .Lcfi341:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi412:
+; NoVLX-NEXT:  .Lcfi342:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11451,8 +11171,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11492,12 +11212,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi413:
+; NoVLX-NEXT:  .Lcfi343:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi414:
+; NoVLX-NEXT:  .Lcfi344:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi415:
+; NoVLX-NEXT:  .Lcfi345:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11506,7 +11226,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -11519,13 +11238,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11732,12 +11452,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi416:
+; NoVLX-NEXT:  .Lcfi346:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi417:
+; NoVLX-NEXT:  .Lcfi347:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi418:
+; NoVLX-NEXT:  .Lcfi348:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -11805,12 +11525,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi419:
+; NoVLX-NEXT:  .Lcfi349:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi420:
+; NoVLX-NEXT:  .Lcfi350:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi421:
+; NoVLX-NEXT:  .Lcfi351:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -11880,12 +11600,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi422:
+; NoVLX-NEXT:  .Lcfi352:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi423:
+; NoVLX-NEXT:  .Lcfi353:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi424:
+; NoVLX-NEXT:  .Lcfi354:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -11957,12 +11677,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi425:
+; NoVLX-NEXT:  .Lcfi355:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi426:
+; NoVLX-NEXT:  .Lcfi356:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi427:
+; NoVLX-NEXT:  .Lcfi357:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -12035,12 +11755,12 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi428:
+; NoVLX-NEXT:  .Lcfi358:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi429:
+; NoVLX-NEXT:  .Lcfi359:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi430:
+; NoVLX-NEXT:  .Lcfi360:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -12111,12 +11831,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi431:
+; NoVLX-NEXT:  .Lcfi361:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi432:
+; NoVLX-NEXT:  .Lcfi362:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi433:
+; NoVLX-NEXT:  .Lcfi363:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -12190,53 +11910,53 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi434:
+; NoVLX-NEXT:  .Lcfi364:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi435:
+; NoVLX-NEXT:  .Lcfi365:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi436:
+; NoVLX-NEXT:  .Lcfi366:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12268,53 +11988,53 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi437:
+; NoVLX-NEXT:  .Lcfi367:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi438:
+; NoVLX-NEXT:  .Lcfi368:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi439:
+; NoVLX-NEXT:  .Lcfi369:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12348,54 +12068,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi440:
+; NoVLX-NEXT:  .Lcfi370:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi441:
+; NoVLX-NEXT:  .Lcfi371:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi442:
+; NoVLX-NEXT:  .Lcfi372:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12430,54 +12150,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi443:
+; NoVLX-NEXT:  .Lcfi373:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi444:
+; NoVLX-NEXT:  .Lcfi374:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi445:
+; NoVLX-NEXT:  .Lcfi375:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12513,53 +12233,53 @@
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi446:
+; NoVLX-NEXT:  .Lcfi376:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi447:
+; NoVLX-NEXT:  .Lcfi377:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi448:
+; NoVLX-NEXT:  .Lcfi378:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12594,54 +12314,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi449:
+; NoVLX-NEXT:  .Lcfi379:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi450:
+; NoVLX-NEXT:  .Lcfi380:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi451:
+; NoVLX-NEXT:  .Lcfi381:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12677,30 +12397,15 @@
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi452:
+; NoVLX-NEXT:  .Lcfi382:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi453:
+; NoVLX-NEXT:  .Lcfi383:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi454:
+; NoVLX-NEXT:  .Lcfi384:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi455:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi456:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi457:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi458:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi459:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -12709,64 +12414,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -12775,12 +12480,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -12802,30 +12502,15 @@
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi460:
+; NoVLX-NEXT:  .Lcfi385:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi461:
+; NoVLX-NEXT:  .Lcfi386:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi462:
+; NoVLX-NEXT:  .Lcfi387:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi463:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi464:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi465:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi466:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi467:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -12834,64 +12519,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -12900,12 +12585,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -12929,30 +12609,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi468:
+; NoVLX-NEXT:  .Lcfi388:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi469:
+; NoVLX-NEXT:  .Lcfi389:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi470:
+; NoVLX-NEXT:  .Lcfi390:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi471:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi472:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi473:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi474:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi475:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -12962,64 +12627,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -13028,12 +12693,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -13058,30 +12718,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi476:
+; NoVLX-NEXT:  .Lcfi391:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi477:
+; NoVLX-NEXT:  .Lcfi392:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi478:
+; NoVLX-NEXT:  .Lcfi393:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi479:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi480:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi481:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi482:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi483:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -13091,64 +12736,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -13157,12 +12802,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -13188,12 +12828,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi484:
+; NoVLX-NEXT:  .Lcfi394:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi485:
+; NoVLX-NEXT:  .Lcfi395:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi486:
+; NoVLX-NEXT:  .Lcfi396:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -13202,24 +12842,20 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi487:
+; NoVLX-NEXT:  .Lcfi397:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi488:
+; NoVLX-NEXT:  .Lcfi398:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi489:
+; NoVLX-NEXT:  .Lcfi399:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi490:
+; NoVLX-NEXT:  .Lcfi400:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi491:
+; NoVLX-NEXT:  .Lcfi401:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -13262,11 +12898,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13278,11 +12914,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -13318,12 +12958,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi492:
+; NoVLX-NEXT:  .Lcfi402:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi493:
+; NoVLX-NEXT:  .Lcfi403:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi494:
+; NoVLX-NEXT:  .Lcfi404:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -13332,24 +12972,20 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi495:
+; NoVLX-NEXT:  .Lcfi405:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi496:
+; NoVLX-NEXT:  .Lcfi406:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi497:
+; NoVLX-NEXT:  .Lcfi407:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi498:
+; NoVLX-NEXT:  .Lcfi408:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi499:
+; NoVLX-NEXT:  .Lcfi409:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -13392,11 +13028,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13408,11 +13044,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -13450,12 +13090,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi500:
+; NoVLX-NEXT:  .Lcfi410:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi501:
+; NoVLX-NEXT:  .Lcfi411:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi502:
+; NoVLX-NEXT:  .Lcfi412:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -13464,25 +13104,21 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi503:
+; NoVLX-NEXT:  .Lcfi413:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi504:
+; NoVLX-NEXT:  .Lcfi414:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi505:
+; NoVLX-NEXT:  .Lcfi415:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi506:
+; NoVLX-NEXT:  .Lcfi416:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi507:
+; NoVLX-NEXT:  .Lcfi417:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -13525,11 +13161,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13541,11 +13177,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -13584,12 +13224,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi508:
+; NoVLX-NEXT:  .Lcfi418:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi509:
+; NoVLX-NEXT:  .Lcfi419:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi510:
+; NoVLX-NEXT:  .Lcfi420:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -13598,25 +13238,21 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi511:
+; NoVLX-NEXT:  .Lcfi421:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi512:
+; NoVLX-NEXT:  .Lcfi422:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi513:
+; NoVLX-NEXT:  .Lcfi423:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi514:
+; NoVLX-NEXT:  .Lcfi424:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi515:
+; NoVLX-NEXT:  .Lcfi425:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -13659,11 +13295,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13675,11 +13311,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -13720,12 +13360,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi516:
+; NoVLX-NEXT:  .Lcfi426:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi517:
+; NoVLX-NEXT:  .Lcfi427:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi518:
+; NoVLX-NEXT:  .Lcfi428:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -13769,12 +13409,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi519:
+; NoVLX-NEXT:  .Lcfi429:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi520:
+; NoVLX-NEXT:  .Lcfi430:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi521:
+; NoVLX-NEXT:  .Lcfi431:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -13820,12 +13460,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi522:
+; NoVLX-NEXT:  .Lcfi432:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi523:
+; NoVLX-NEXT:  .Lcfi433:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi524:
+; NoVLX-NEXT:  .Lcfi434:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -13881,12 +13521,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi525:
+; NoVLX-NEXT:  .Lcfi435:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi526:
+; NoVLX-NEXT:  .Lcfi436:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi527:
+; NoVLX-NEXT:  .Lcfi437:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -14065,12 +13705,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi528:
+; NoVLX-NEXT:  .Lcfi438:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi529:
+; NoVLX-NEXT:  .Lcfi439:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi530:
+; NoVLX-NEXT:  .Lcfi440:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -14140,12 +13780,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi531:
+; NoVLX-NEXT:  .Lcfi441:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi532:
+; NoVLX-NEXT:  .Lcfi442:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi533:
+; NoVLX-NEXT:  .Lcfi443:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -14217,12 +13857,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi534:
+; NoVLX-NEXT:  .Lcfi444:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi535:
+; NoVLX-NEXT:  .Lcfi445:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi536:
+; NoVLX-NEXT:  .Lcfi446:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -14296,12 +13936,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi537:
+; NoVLX-NEXT:  .Lcfi447:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi538:
+; NoVLX-NEXT:  .Lcfi448:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi539:
+; NoVLX-NEXT:  .Lcfi449:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -14376,12 +14016,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi540:
+; NoVLX-NEXT:  .Lcfi450:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi541:
+; NoVLX-NEXT:  .Lcfi451:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi542:
+; NoVLX-NEXT:  .Lcfi452:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -14389,43 +14029,43 @@
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -14456,12 +14096,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi543:
+; NoVLX-NEXT:  .Lcfi453:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi544:
+; NoVLX-NEXT:  .Lcfi454:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi545:
+; NoVLX-NEXT:  .Lcfi455:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -14469,43 +14109,43 @@
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -14538,12 +14178,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi546:
+; NoVLX-NEXT:  .Lcfi456:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi547:
+; NoVLX-NEXT:  .Lcfi457:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi548:
+; NoVLX-NEXT:  .Lcfi458:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -14552,43 +14192,43 @@
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -14622,12 +14262,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi549:
+; NoVLX-NEXT:  .Lcfi459:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi550:
+; NoVLX-NEXT:  .Lcfi460:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi551:
+; NoVLX-NEXT:  .Lcfi461:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -14636,43 +14276,43 @@
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -14708,30 +14348,15 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi552:
+; NoVLX-NEXT:  .Lcfi462:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi553:
+; NoVLX-NEXT:  .Lcfi463:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi554:
+; NoVLX-NEXT:  .Lcfi464:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi555:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi556:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi557:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi558:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi559:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14740,64 +14365,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -14806,12 +14431,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -14834,30 +14454,15 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi560:
+; NoVLX-NEXT:  .Lcfi465:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi561:
+; NoVLX-NEXT:  .Lcfi466:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi562:
+; NoVLX-NEXT:  .Lcfi467:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi563:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi564:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi565:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi566:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi567:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14866,64 +14471,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -14932,12 +14537,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -14962,30 +14562,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi568:
+; NoVLX-NEXT:  .Lcfi468:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi569:
+; NoVLX-NEXT:  .Lcfi469:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi570:
+; NoVLX-NEXT:  .Lcfi470:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi571:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi572:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi573:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi574:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi575:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14995,64 +14580,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -15061,12 +14646,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -15092,30 +14672,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi576:
+; NoVLX-NEXT:  .Lcfi471:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi577:
+; NoVLX-NEXT:  .Lcfi472:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi578:
+; NoVLX-NEXT:  .Lcfi473:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi579:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi580:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi581:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi582:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi583:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -15125,64 +14690,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -15191,12 +14756,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -15223,12 +14783,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi584:
+; NoVLX-NEXT:  .Lcfi474:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi585:
+; NoVLX-NEXT:  .Lcfi475:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi586:
+; NoVLX-NEXT:  .Lcfi476:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -15237,24 +14797,20 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi587:
+; NoVLX-NEXT:  .Lcfi477:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi588:
+; NoVLX-NEXT:  .Lcfi478:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi589:
+; NoVLX-NEXT:  .Lcfi479:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi590:
+; NoVLX-NEXT:  .Lcfi480:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi591:
+; NoVLX-NEXT:  .Lcfi481:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -15297,11 +14853,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15313,11 +14869,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -15354,12 +14914,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi592:
+; NoVLX-NEXT:  .Lcfi482:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi593:
+; NoVLX-NEXT:  .Lcfi483:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi594:
+; NoVLX-NEXT:  .Lcfi484:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -15368,24 +14928,20 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi595:
+; NoVLX-NEXT:  .Lcfi485:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi596:
+; NoVLX-NEXT:  .Lcfi486:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi597:
+; NoVLX-NEXT:  .Lcfi487:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi598:
+; NoVLX-NEXT:  .Lcfi488:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi599:
+; NoVLX-NEXT:  .Lcfi489:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -15428,11 +14984,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15444,11 +15000,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -15487,12 +15047,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi600:
+; NoVLX-NEXT:  .Lcfi490:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi601:
+; NoVLX-NEXT:  .Lcfi491:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi602:
+; NoVLX-NEXT:  .Lcfi492:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -15501,25 +15061,21 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi603:
+; NoVLX-NEXT:  .Lcfi493:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi604:
+; NoVLX-NEXT:  .Lcfi494:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi605:
+; NoVLX-NEXT:  .Lcfi495:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi606:
+; NoVLX-NEXT:  .Lcfi496:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi607:
+; NoVLX-NEXT:  .Lcfi497:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -15562,11 +15118,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15578,11 +15134,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -15622,12 +15182,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi608:
+; NoVLX-NEXT:  .Lcfi498:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi609:
+; NoVLX-NEXT:  .Lcfi499:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi610:
+; NoVLX-NEXT:  .Lcfi500:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -15636,25 +15196,21 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi611:
+; NoVLX-NEXT:  .Lcfi501:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi612:
+; NoVLX-NEXT:  .Lcfi502:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi613:
+; NoVLX-NEXT:  .Lcfi503:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi614:
+; NoVLX-NEXT:  .Lcfi504:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi615:
+; NoVLX-NEXT:  .Lcfi505:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -15697,11 +15253,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15713,11 +15269,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -15758,32 +15318,47 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi616:
+; NoVLX-NEXT:  .Lcfi506:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi617:
+; NoVLX-NEXT:  .Lcfi507:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi618:
+; NoVLX-NEXT:  .Lcfi508:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -15791,7 +15366,8 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -15801,6 +15377,27 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
@@ -15811,97 +15408,83 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
 ; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm7
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm3, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm8, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -15909,102 +15492,79 @@
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
@@ -16109,29 +15669,47 @@
 ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi619:
+; NoVLX-NEXT:  .Lcfi509:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi620:
+; NoVLX-NEXT:  .Lcfi510:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi621:
+; NoVLX-NEXT:  .Lcfi511:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16139,39 +15717,20 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -16179,7 +15738,8 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -16189,19 +15749,19 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm0
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
 ; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw 32(%rdi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
@@ -16377,12 +15937,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi622:
+; NoVLX-NEXT:  .Lcfi512:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi623:
+; NoVLX-NEXT:  .Lcfi513:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi624:
+; NoVLX-NEXT:  .Lcfi514:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -16393,17 +15953,12 @@
 ; NoVLX-NEXT:    movq %rax, %rdx
 ; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16411,9 +15966,10 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm4
@@ -16431,87 +15987,8 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -16521,9 +15998,50 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm5
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm6, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
@@ -16531,29 +16049,66 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm6
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm6
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm3
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm4
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16562,9 +16117,76 @@
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm4, %ymm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpcmpgtw %ymm4, %ymm1, %ymm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
 ; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
@@ -16623,86 +16245,24 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -16740,12 +16300,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi625:
+; NoVLX-NEXT:  .Lcfi515:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi626:
+; NoVLX-NEXT:  .Lcfi516:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi627:
+; NoVLX-NEXT:  .Lcfi517:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -16757,8 +16317,6 @@
 ; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
@@ -16771,19 +16329,20 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16791,6 +16350,7 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
@@ -16813,112 +16373,40 @@
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm2, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %eax, %xmm2
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpcmpgtw 32(%rsi), %ymm3, %ymm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm4
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm3, %ymm3
 ; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
 ; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -16972,15 +16460,87 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm2
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm1
+; NoVLX-NEXT:    vpcmpgtw 32(%rsi), %ymm2, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -17035,8 +16595,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17090,8 +16650,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17130,7 +16690,6 @@
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17142,13 +16701,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17165,8 +16725,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17242,8 +16802,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17303,8 +16863,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17345,7 +16905,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17357,13 +16916,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17380,8 +16940,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17440,8 +17000,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17494,8 +17054,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17534,7 +17094,6 @@
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17546,13 +17105,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17568,8 +17128,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17644,8 +17204,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17704,8 +17264,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17746,7 +17306,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17758,13 +17317,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17780,8 +17340,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17824,12 +17384,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi628:
+; NoVLX-NEXT:  .Lcfi518:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi629:
+; NoVLX-NEXT:  .Lcfi519:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi630:
+; NoVLX-NEXT:  .Lcfi520:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -17867,12 +17427,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi631:
+; NoVLX-NEXT:  .Lcfi521:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi632:
+; NoVLX-NEXT:  .Lcfi522:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi633:
+; NoVLX-NEXT:  .Lcfi523:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -17912,16 +17472,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi634:
+; NoVLX-NEXT:  .Lcfi524:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi635:
+; NoVLX-NEXT:  .Lcfi525:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi636:
+; NoVLX-NEXT:  .Lcfi526:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17933,13 +17492,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -17977,12 +17537,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi637:
+; NoVLX-NEXT:  .Lcfi527:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi638:
+; NoVLX-NEXT:  .Lcfi528:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi639:
+; NoVLX-NEXT:  .Lcfi529:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18043,12 +17603,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi640:
+; NoVLX-NEXT:  .Lcfi530:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi641:
+; NoVLX-NEXT:  .Lcfi531:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi642:
+; NoVLX-NEXT:  .Lcfi532:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18090,17 +17650,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi643:
+; NoVLX-NEXT:  .Lcfi533:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi644:
+; NoVLX-NEXT:  .Lcfi534:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi645:
+; NoVLX-NEXT:  .Lcfi535:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -18112,13 +17671,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -18158,20 +17718,20 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi646:
+; NoVLX-NEXT:  .Lcfi536:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi647:
+; NoVLX-NEXT:  .Lcfi537:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi648:
+; NoVLX-NEXT:  .Lcfi538:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18207,20 +17767,20 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi649:
+; NoVLX-NEXT:  .Lcfi539:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi650:
+; NoVLX-NEXT:  .Lcfi540:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi651:
+; NoVLX-NEXT:  .Lcfi541:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18258,19 +17818,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi652:
+; NoVLX-NEXT:  .Lcfi542:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi653:
+; NoVLX-NEXT:  .Lcfi543:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi654:
+; NoVLX-NEXT:  .Lcfi544:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -18283,13 +17842,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18329,19 +17889,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi655:
+; NoVLX-NEXT:  .Lcfi545:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi656:
+; NoVLX-NEXT:  .Lcfi546:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi657:
+; NoVLX-NEXT:  .Lcfi547:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -18354,13 +17913,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18401,12 +17961,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi658:
+; NoVLX-NEXT:  .Lcfi548:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi659:
+; NoVLX-NEXT:  .Lcfi549:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi660:
+; NoVLX-NEXT:  .Lcfi550:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -18414,8 +17974,8 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18454,12 +18014,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi661:
+; NoVLX-NEXT:  .Lcfi551:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi662:
+; NoVLX-NEXT:  .Lcfi552:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi663:
+; NoVLX-NEXT:  .Lcfi553:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -18467,7 +18027,6 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -18480,13 +18039,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18717,12 +18277,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi664:
+; NoVLX-NEXT:  .Lcfi554:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi665:
+; NoVLX-NEXT:  .Lcfi555:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi666:
+; NoVLX-NEXT:  .Lcfi556:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18792,12 +18352,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi667:
+; NoVLX-NEXT:  .Lcfi557:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi668:
+; NoVLX-NEXT:  .Lcfi558:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi669:
+; NoVLX-NEXT:  .Lcfi559:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18869,12 +18429,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi670:
+; NoVLX-NEXT:  .Lcfi560:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi671:
+; NoVLX-NEXT:  .Lcfi561:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi672:
+; NoVLX-NEXT:  .Lcfi562:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18949,12 +18509,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi673:
+; NoVLX-NEXT:  .Lcfi563:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi674:
+; NoVLX-NEXT:  .Lcfi564:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi675:
+; NoVLX-NEXT:  .Lcfi565:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -19030,12 +18590,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi676:
+; NoVLX-NEXT:  .Lcfi566:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi677:
+; NoVLX-NEXT:  .Lcfi567:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi678:
+; NoVLX-NEXT:  .Lcfi568:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -19108,12 +18668,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi679:
+; NoVLX-NEXT:  .Lcfi569:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi680:
+; NoVLX-NEXT:  .Lcfi570:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi681:
+; NoVLX-NEXT:  .Lcfi571:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -19190,55 +18750,55 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi682:
+; NoVLX-NEXT:  .Lcfi572:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi683:
+; NoVLX-NEXT:  .Lcfi573:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi684:
+; NoVLX-NEXT:  .Lcfi574:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19270,55 +18830,55 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi685:
+; NoVLX-NEXT:  .Lcfi575:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi686:
+; NoVLX-NEXT:  .Lcfi576:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi687:
+; NoVLX-NEXT:  .Lcfi577:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19352,12 +18912,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi688:
+; NoVLX-NEXT:  .Lcfi578:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi689:
+; NoVLX-NEXT:  .Lcfi579:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi690:
+; NoVLX-NEXT:  .Lcfi580:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -19366,43 +18926,43 @@
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19437,12 +18997,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi691:
+; NoVLX-NEXT:  .Lcfi581:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi692:
+; NoVLX-NEXT:  .Lcfi582:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi693:
+; NoVLX-NEXT:  .Lcfi583:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -19451,43 +19011,43 @@
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19523,55 +19083,55 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi694:
+; NoVLX-NEXT:  .Lcfi584:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi695:
+; NoVLX-NEXT:  .Lcfi585:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi696:
+; NoVLX-NEXT:  .Lcfi586:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19606,12 +19166,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi697:
+; NoVLX-NEXT:  .Lcfi587:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi698:
+; NoVLX-NEXT:  .Lcfi588:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi699:
+; NoVLX-NEXT:  .Lcfi589:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -19620,43 +19180,43 @@
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k0, %k1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19693,93 +19253,78 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi700:
+; NoVLX-NEXT:  .Lcfi590:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi701:
+; NoVLX-NEXT:  .Lcfi591:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi702:
+; NoVLX-NEXT:  .Lcfi592:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi703:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi704:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi705:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi706:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi707:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -19788,12 +19333,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -19816,93 +19356,78 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi708:
+; NoVLX-NEXT:  .Lcfi593:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi709:
+; NoVLX-NEXT:  .Lcfi594:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi710:
+; NoVLX-NEXT:  .Lcfi595:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi711:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi712:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi713:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi714:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi715:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -19911,12 +19436,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -19941,94 +19461,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi716:
+; NoVLX-NEXT:  .Lcfi596:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi717:
+; NoVLX-NEXT:  .Lcfi597:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi718:
+; NoVLX-NEXT:  .Lcfi598:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi719:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi720:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi721:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi722:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi723:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20037,12 +19542,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -20068,94 +19568,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi724:
+; NoVLX-NEXT:  .Lcfi599:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi725:
+; NoVLX-NEXT:  .Lcfi600:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi726:
+; NoVLX-NEXT:  .Lcfi601:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi727:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi728:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi729:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi730:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi731:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20164,12 +19649,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -20196,93 +19676,78 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi732:
+; NoVLX-NEXT:  .Lcfi602:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi733:
+; NoVLX-NEXT:  .Lcfi603:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi734:
+; NoVLX-NEXT:  .Lcfi604:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi735:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi736:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi737:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi738:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi739:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20291,12 +19756,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -20322,94 +19782,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi740:
+; NoVLX-NEXT:  .Lcfi605:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi741:
+; NoVLX-NEXT:  .Lcfi606:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi742:
+; NoVLX-NEXT:  .Lcfi607:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi743:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi744:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi745:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi746:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi747:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20418,12 +19863,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -20451,12 +19891,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi748:
+; NoVLX-NEXT:  .Lcfi608:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi749:
+; NoVLX-NEXT:  .Lcfi609:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi750:
+; NoVLX-NEXT:  .Lcfi610:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20465,21 +19905,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi751:
+; NoVLX-NEXT:  .Lcfi611:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi752:
+; NoVLX-NEXT:  .Lcfi612:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi753:
+; NoVLX-NEXT:  .Lcfi613:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi754:
+; NoVLX-NEXT:  .Lcfi614:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi755:
+; NoVLX-NEXT:  .Lcfi615:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -20522,11 +19958,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20538,11 +19974,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -20579,12 +20019,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi756:
+; NoVLX-NEXT:  .Lcfi616:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi757:
+; NoVLX-NEXT:  .Lcfi617:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi758:
+; NoVLX-NEXT:  .Lcfi618:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20593,21 +20033,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi759:
+; NoVLX-NEXT:  .Lcfi619:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi760:
+; NoVLX-NEXT:  .Lcfi620:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi761:
+; NoVLX-NEXT:  .Lcfi621:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi762:
+; NoVLX-NEXT:  .Lcfi622:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi763:
+; NoVLX-NEXT:  .Lcfi623:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -20650,11 +20086,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20666,11 +20102,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -20709,12 +20149,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi764:
+; NoVLX-NEXT:  .Lcfi624:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi765:
+; NoVLX-NEXT:  .Lcfi625:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi766:
+; NoVLX-NEXT:  .Lcfi626:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20723,22 +20163,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi767:
+; NoVLX-NEXT:  .Lcfi627:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi768:
+; NoVLX-NEXT:  .Lcfi628:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi769:
+; NoVLX-NEXT:  .Lcfi629:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi770:
+; NoVLX-NEXT:  .Lcfi630:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi771:
+; NoVLX-NEXT:  .Lcfi631:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -20781,11 +20217,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20797,11 +20233,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -20841,12 +20281,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi772:
+; NoVLX-NEXT:  .Lcfi632:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi773:
+; NoVLX-NEXT:  .Lcfi633:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi774:
+; NoVLX-NEXT:  .Lcfi634:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20855,22 +20295,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi775:
+; NoVLX-NEXT:  .Lcfi635:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi776:
+; NoVLX-NEXT:  .Lcfi636:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi777:
+; NoVLX-NEXT:  .Lcfi637:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi778:
+; NoVLX-NEXT:  .Lcfi638:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi779:
+; NoVLX-NEXT:  .Lcfi639:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -20913,11 +20349,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20929,11 +20365,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -20974,12 +20414,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi780:
+; NoVLX-NEXT:  .Lcfi640:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi781:
+; NoVLX-NEXT:  .Lcfi641:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi782:
+; NoVLX-NEXT:  .Lcfi642:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20988,21 +20428,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi783:
+; NoVLX-NEXT:  .Lcfi643:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi784:
+; NoVLX-NEXT:  .Lcfi644:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi785:
+; NoVLX-NEXT:  .Lcfi645:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi786:
+; NoVLX-NEXT:  .Lcfi646:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi787:
+; NoVLX-NEXT:  .Lcfi647:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -21045,11 +20481,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -21061,11 +20497,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -21105,12 +20545,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi788:
+; NoVLX-NEXT:  .Lcfi648:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi789:
+; NoVLX-NEXT:  .Lcfi649:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi790:
+; NoVLX-NEXT:  .Lcfi650:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -21119,22 +20559,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi791:
+; NoVLX-NEXT:  .Lcfi651:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi792:
+; NoVLX-NEXT:  .Lcfi652:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi793:
+; NoVLX-NEXT:  .Lcfi653:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi794:
+; NoVLX-NEXT:  .Lcfi654:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi795:
+; NoVLX-NEXT:  .Lcfi655:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -21177,11 +20613,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -21193,11 +20629,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -21294,7 +20734,6 @@
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -21302,9 +20741,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -21409,7 +20849,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -21417,9 +20856,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -22018,12 +21458,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi796:
+; NoVLX-NEXT:  .Lcfi656:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi797:
+; NoVLX-NEXT:  .Lcfi657:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi798:
+; NoVLX-NEXT:  .Lcfi658:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -22061,12 +21501,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi799:
+; NoVLX-NEXT:  .Lcfi659:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi800:
+; NoVLX-NEXT:  .Lcfi660:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi801:
+; NoVLX-NEXT:  .Lcfi661:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -22106,16 +21546,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi802:
+; NoVLX-NEXT:  .Lcfi662:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi803:
+; NoVLX-NEXT:  .Lcfi663:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi804:
+; NoVLX-NEXT:  .Lcfi664:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -22123,9 +21562,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -22163,12 +21603,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi805:
+; NoVLX-NEXT:  .Lcfi665:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi806:
+; NoVLX-NEXT:  .Lcfi666:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi807:
+; NoVLX-NEXT:  .Lcfi667:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -22221,12 +21661,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi808:
+; NoVLX-NEXT:  .Lcfi668:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi809:
+; NoVLX-NEXT:  .Lcfi669:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi810:
+; NoVLX-NEXT:  .Lcfi670:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -22268,17 +21708,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi811:
+; NoVLX-NEXT:  .Lcfi671:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi812:
+; NoVLX-NEXT:  .Lcfi672:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi813:
+; NoVLX-NEXT:  .Lcfi673:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -22286,9 +21725,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -22328,20 +21768,20 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi814:
+; NoVLX-NEXT:  .Lcfi674:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi815:
+; NoVLX-NEXT:  .Lcfi675:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi816:
+; NoVLX-NEXT:  .Lcfi676:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22377,20 +21817,20 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi817:
+; NoVLX-NEXT:  .Lcfi677:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi818:
+; NoVLX-NEXT:  .Lcfi678:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi819:
+; NoVLX-NEXT:  .Lcfi679:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22428,12 +21868,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi820:
+; NoVLX-NEXT:  .Lcfi680:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi821:
+; NoVLX-NEXT:  .Lcfi681:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi822:
+; NoVLX-NEXT:  .Lcfi682:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -22450,8 +21890,8 @@
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22491,12 +21931,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi823:
+; NoVLX-NEXT:  .Lcfi683:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi824:
+; NoVLX-NEXT:  .Lcfi684:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi825:
+; NoVLX-NEXT:  .Lcfi685:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -22513,8 +21953,8 @@
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22555,12 +21995,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi826:
+; NoVLX-NEXT:  .Lcfi686:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi827:
+; NoVLX-NEXT:  .Lcfi687:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi828:
+; NoVLX-NEXT:  .Lcfi688:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -22568,8 +22008,8 @@
 ; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22608,12 +22048,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi829:
+; NoVLX-NEXT:  .Lcfi689:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi830:
+; NoVLX-NEXT:  .Lcfi690:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi831:
+; NoVLX-NEXT:  .Lcfi691:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -22631,8 +22071,8 @@
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22693,8 +22133,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22750,8 +22190,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22792,7 +22232,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -22810,6 +22249,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -22827,8 +22267,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22871,7 +22311,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -22889,6 +22328,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -22906,8 +22346,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22969,8 +22409,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -23013,7 +22453,6 @@
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23031,6 +22470,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -23048,8 +22488,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -23110,8 +22550,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23166,8 +22606,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23208,7 +22648,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23226,6 +22665,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -23242,8 +22682,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23286,7 +22726,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23304,6 +22743,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -23320,8 +22760,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23382,8 +22822,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23426,7 +22866,6 @@
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23444,6 +22883,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -23460,8 +22900,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23505,12 +22945,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi832:
+; NoVLX-NEXT:  .Lcfi692:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi833:
+; NoVLX-NEXT:  .Lcfi693:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi834:
+; NoVLX-NEXT:  .Lcfi694:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -23550,12 +22990,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi835:
+; NoVLX-NEXT:  .Lcfi695:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi836:
+; NoVLX-NEXT:  .Lcfi696:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi837:
+; NoVLX-NEXT:  .Lcfi697:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -23597,17 +23037,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi838:
+; NoVLX-NEXT:  .Lcfi698:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi839:
+; NoVLX-NEXT:  .Lcfi699:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi840:
+; NoVLX-NEXT:  .Lcfi700:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23625,6 +23064,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -23664,17 +23104,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi841:
+; NoVLX-NEXT:  .Lcfi701:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi842:
+; NoVLX-NEXT:  .Lcfi702:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi843:
+; NoVLX-NEXT:  .Lcfi703:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23692,6 +23131,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -23732,12 +23172,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi844:
+; NoVLX-NEXT:  .Lcfi704:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi845:
+; NoVLX-NEXT:  .Lcfi705:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi846:
+; NoVLX-NEXT:  .Lcfi706:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -23781,18 +23221,17 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi847:
+; NoVLX-NEXT:  .Lcfi707:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi848:
+; NoVLX-NEXT:  .Lcfi708:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi849:
+; NoVLX-NEXT:  .Lcfi709:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23810,6 +23249,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -23851,12 +23291,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi850:
+; NoVLX-NEXT:  .Lcfi710:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi851:
+; NoVLX-NEXT:  .Lcfi711:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi852:
+; NoVLX-NEXT:  .Lcfi712:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -23864,8 +23304,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -23902,12 +23342,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi853:
+; NoVLX-NEXT:  .Lcfi713:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi854:
+; NoVLX-NEXT:  .Lcfi714:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi855:
+; NoVLX-NEXT:  .Lcfi715:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -23915,8 +23355,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -23955,12 +23395,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi856:
+; NoVLX-NEXT:  .Lcfi716:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi857:
+; NoVLX-NEXT:  .Lcfi717:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi858:
+; NoVLX-NEXT:  .Lcfi718:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -23968,7 +23408,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -23981,13 +23420,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -24028,12 +23468,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi859:
+; NoVLX-NEXT:  .Lcfi719:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi860:
+; NoVLX-NEXT:  .Lcfi720:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi861:
+; NoVLX-NEXT:  .Lcfi721:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -24041,7 +23481,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -24054,13 +23493,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -24102,12 +23542,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi862:
+; NoVLX-NEXT:  .Lcfi722:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi863:
+; NoVLX-NEXT:  .Lcfi723:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi864:
+; NoVLX-NEXT:  .Lcfi724:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -24116,8 +23556,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -24157,12 +23597,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi865:
+; NoVLX-NEXT:  .Lcfi725:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi866:
+; NoVLX-NEXT:  .Lcfi726:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi867:
+; NoVLX-NEXT:  .Lcfi727:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -24171,7 +23611,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -24184,13 +23623,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -24397,12 +23837,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi868:
+; NoVLX-NEXT:  .Lcfi728:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi869:
+; NoVLX-NEXT:  .Lcfi729:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi870:
+; NoVLX-NEXT:  .Lcfi730:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24470,12 +23910,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi871:
+; NoVLX-NEXT:  .Lcfi731:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi872:
+; NoVLX-NEXT:  .Lcfi732:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi873:
+; NoVLX-NEXT:  .Lcfi733:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24545,12 +23985,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi874:
+; NoVLX-NEXT:  .Lcfi734:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi875:
+; NoVLX-NEXT:  .Lcfi735:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi876:
+; NoVLX-NEXT:  .Lcfi736:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24622,12 +24062,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi877:
+; NoVLX-NEXT:  .Lcfi737:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi878:
+; NoVLX-NEXT:  .Lcfi738:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi879:
+; NoVLX-NEXT:  .Lcfi739:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24700,12 +24140,12 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi880:
+; NoVLX-NEXT:  .Lcfi740:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi881:
+; NoVLX-NEXT:  .Lcfi741:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi882:
+; NoVLX-NEXT:  .Lcfi742:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24776,12 +24216,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi883:
+; NoVLX-NEXT:  .Lcfi743:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi884:
+; NoVLX-NEXT:  .Lcfi744:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi885:
+; NoVLX-NEXT:  .Lcfi745:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24855,53 +24295,53 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi886:
+; NoVLX-NEXT:  .Lcfi746:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi887:
+; NoVLX-NEXT:  .Lcfi747:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi888:
+; NoVLX-NEXT:  .Lcfi748:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -24933,53 +24373,53 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi889:
+; NoVLX-NEXT:  .Lcfi749:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi890:
+; NoVLX-NEXT:  .Lcfi750:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi891:
+; NoVLX-NEXT:  .Lcfi751:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25013,54 +24453,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi892:
+; NoVLX-NEXT:  .Lcfi752:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi893:
+; NoVLX-NEXT:  .Lcfi753:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi894:
+; NoVLX-NEXT:  .Lcfi754:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25095,54 +24535,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi895:
+; NoVLX-NEXT:  .Lcfi755:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi896:
+; NoVLX-NEXT:  .Lcfi756:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi897:
+; NoVLX-NEXT:  .Lcfi757:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25178,53 +24618,53 @@
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi898:
+; NoVLX-NEXT:  .Lcfi758:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi899:
+; NoVLX-NEXT:  .Lcfi759:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi900:
+; NoVLX-NEXT:  .Lcfi760:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25259,54 +24699,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi901:
+; NoVLX-NEXT:  .Lcfi761:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi902:
+; NoVLX-NEXT:  .Lcfi762:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi903:
+; NoVLX-NEXT:  .Lcfi763:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25342,30 +24782,15 @@
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi904:
+; NoVLX-NEXT:  .Lcfi764:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi905:
+; NoVLX-NEXT:  .Lcfi765:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi906:
+; NoVLX-NEXT:  .Lcfi766:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi907:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi908:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi909:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi910:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi911:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
@@ -25376,64 +24801,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25442,12 +24867,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -25469,30 +24889,15 @@
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi912:
+; NoVLX-NEXT:  .Lcfi767:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi913:
+; NoVLX-NEXT:  .Lcfi768:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi914:
+; NoVLX-NEXT:  .Lcfi769:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi915:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi916:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi917:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi918:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi919:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25504,64 +24909,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25570,12 +24975,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -25599,30 +24999,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi920:
+; NoVLX-NEXT:  .Lcfi770:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi921:
+; NoVLX-NEXT:  .Lcfi771:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi922:
+; NoVLX-NEXT:  .Lcfi772:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi923:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi924:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi925:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi926:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi927:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
@@ -25634,64 +25019,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25700,12 +25085,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -25730,30 +25110,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi928:
+; NoVLX-NEXT:  .Lcfi773:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi929:
+; NoVLX-NEXT:  .Lcfi774:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi930:
+; NoVLX-NEXT:  .Lcfi775:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi931:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi932:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi933:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi934:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi935:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25766,64 +25131,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25832,12 +25197,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -25863,12 +25223,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi936:
+; NoVLX-NEXT:  .Lcfi776:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi937:
+; NoVLX-NEXT:  .Lcfi777:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi938:
+; NoVLX-NEXT:  .Lcfi778:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -25877,15 +25237,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi939:
+; NoVLX-NEXT:  .Lcfi779:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi940:
+; NoVLX-NEXT:  .Lcfi780:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi941:
+; NoVLX-NEXT:  .Lcfi781:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi942:
+; NoVLX-NEXT:  .Lcfi782:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi943:
+; NoVLX-NEXT:  .Lcfi783:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25893,10 +25253,6 @@
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -25939,11 +25295,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -25955,11 +25311,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25995,12 +25355,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi944:
+; NoVLX-NEXT:  .Lcfi784:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi945:
+; NoVLX-NEXT:  .Lcfi785:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi946:
+; NoVLX-NEXT:  .Lcfi786:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -26009,15 +25369,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi947:
+; NoVLX-NEXT:  .Lcfi787:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi948:
+; NoVLX-NEXT:  .Lcfi788:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi949:
+; NoVLX-NEXT:  .Lcfi789:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi950:
+; NoVLX-NEXT:  .Lcfi790:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi951:
+; NoVLX-NEXT:  .Lcfi791:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
@@ -26026,10 +25386,6 @@
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -26072,11 +25428,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -26088,11 +25444,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -26130,12 +25490,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi952:
+; NoVLX-NEXT:  .Lcfi792:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi953:
+; NoVLX-NEXT:  .Lcfi793:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi954:
+; NoVLX-NEXT:  .Lcfi794:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -26144,15 +25504,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi955:
+; NoVLX-NEXT:  .Lcfi795:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi956:
+; NoVLX-NEXT:  .Lcfi796:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi957:
+; NoVLX-NEXT:  .Lcfi797:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi958:
+; NoVLX-NEXT:  .Lcfi798:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi959:
+; NoVLX-NEXT:  .Lcfi799:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -26161,10 +25521,6 @@
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -26207,11 +25563,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -26223,11 +25579,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -26266,12 +25626,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi960:
+; NoVLX-NEXT:  .Lcfi800:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi961:
+; NoVLX-NEXT:  .Lcfi801:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi962:
+; NoVLX-NEXT:  .Lcfi802:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -26280,15 +25640,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi963:
+; NoVLX-NEXT:  .Lcfi803:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi964:
+; NoVLX-NEXT:  .Lcfi804:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi965:
+; NoVLX-NEXT:  .Lcfi805:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi966:
+; NoVLX-NEXT:  .Lcfi806:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi967:
+; NoVLX-NEXT:  .Lcfi807:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
@@ -26298,10 +25658,6 @@
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -26344,11 +25700,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -26360,11 +25716,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -26405,12 +25765,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi968:
+; NoVLX-NEXT:  .Lcfi808:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi969:
+; NoVLX-NEXT:  .Lcfi809:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi970:
+; NoVLX-NEXT:  .Lcfi810:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -26456,12 +25816,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi971:
+; NoVLX-NEXT:  .Lcfi811:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi972:
+; NoVLX-NEXT:  .Lcfi812:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi973:
+; NoVLX-NEXT:  .Lcfi813:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -26510,12 +25870,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi974:
+; NoVLX-NEXT:  .Lcfi814:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi975:
+; NoVLX-NEXT:  .Lcfi815:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi976:
+; NoVLX-NEXT:  .Lcfi816:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -26573,12 +25933,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi977:
+; NoVLX-NEXT:  .Lcfi817:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi978:
+; NoVLX-NEXT:  .Lcfi818:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi979:
+; NoVLX-NEXT:  .Lcfi819:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -26770,12 +26130,12 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi980:
+; NoVLX-NEXT:  .Lcfi820:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi981:
+; NoVLX-NEXT:  .Lcfi821:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi982:
+; NoVLX-NEXT:  .Lcfi822:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -26847,12 +26207,12 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi983:
+; NoVLX-NEXT:  .Lcfi823:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi984:
+; NoVLX-NEXT:  .Lcfi824:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi985:
+; NoVLX-NEXT:  .Lcfi825:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -26927,12 +26287,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi986:
+; NoVLX-NEXT:  .Lcfi826:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi987:
+; NoVLX-NEXT:  .Lcfi827:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi988:
+; NoVLX-NEXT:  .Lcfi828:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -27008,12 +26368,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi989:
+; NoVLX-NEXT:  .Lcfi829:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi990:
+; NoVLX-NEXT:  .Lcfi830:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi991:
+; NoVLX-NEXT:  .Lcfi831:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -27091,12 +26451,12 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi992:
+; NoVLX-NEXT:  .Lcfi832:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi993:
+; NoVLX-NEXT:  .Lcfi833:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi994:
+; NoVLX-NEXT:  .Lcfi834:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -27106,43 +26466,43 @@
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -27173,12 +26533,12 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi995:
+; NoVLX-NEXT:  .Lcfi835:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi996:
+; NoVLX-NEXT:  .Lcfi836:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi997:
+; NoVLX-NEXT:  .Lcfi837:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -27189,43 +26549,43 @@
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -27258,12 +26618,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi998:
+; NoVLX-NEXT:  .Lcfi838:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi999:
+; NoVLX-NEXT:  .Lcfi839:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1000:
+; NoVLX-NEXT:  .Lcfi840:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -27274,43 +26634,43 @@
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -27344,12 +26704,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1001:
+; NoVLX-NEXT:  .Lcfi841:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1002:
+; NoVLX-NEXT:  .Lcfi842:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1003:
+; NoVLX-NEXT:  .Lcfi843:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -27361,43 +26721,43 @@
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -27433,30 +26793,15 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1004:
+; NoVLX-NEXT:  .Lcfi844:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1005:
+; NoVLX-NEXT:  .Lcfi845:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1006:
+; NoVLX-NEXT:  .Lcfi846:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1007:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1008:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1009:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1010:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1011:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
@@ -27467,64 +26812,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27533,12 +26878,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -27561,30 +26901,15 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1012:
+; NoVLX-NEXT:  .Lcfi847:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1013:
+; NoVLX-NEXT:  .Lcfi848:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1014:
+; NoVLX-NEXT:  .Lcfi849:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1015:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1016:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1017:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1018:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1019:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -27596,64 +26921,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27662,12 +26987,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -27692,30 +27012,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1020:
+; NoVLX-NEXT:  .Lcfi850:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1021:
+; NoVLX-NEXT:  .Lcfi851:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1022:
+; NoVLX-NEXT:  .Lcfi852:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1023:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1024:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1025:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1026:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1027:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
@@ -27727,64 +27032,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27793,12 +27098,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -27824,30 +27124,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1028:
+; NoVLX-NEXT:  .Lcfi853:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1029:
+; NoVLX-NEXT:  .Lcfi854:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1030:
+; NoVLX-NEXT:  .Lcfi855:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1031:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1032:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1033:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1034:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1035:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -27860,64 +27145,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27926,12 +27211,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -27958,12 +27238,12 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1036:
+; NoVLX-NEXT:  .Lcfi856:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1037:
+; NoVLX-NEXT:  .Lcfi857:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1038:
+; NoVLX-NEXT:  .Lcfi858:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -27972,15 +27252,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1039:
+; NoVLX-NEXT:  .Lcfi859:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1040:
+; NoVLX-NEXT:  .Lcfi860:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1041:
+; NoVLX-NEXT:  .Lcfi861:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1042:
+; NoVLX-NEXT:  .Lcfi862:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1043:
+; NoVLX-NEXT:  .Lcfi863:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -27988,10 +27268,6 @@
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -28034,11 +27310,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28050,11 +27326,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -28091,12 +27371,12 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1044:
+; NoVLX-NEXT:  .Lcfi864:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1045:
+; NoVLX-NEXT:  .Lcfi865:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1046:
+; NoVLX-NEXT:  .Lcfi866:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -28105,15 +27385,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1047:
+; NoVLX-NEXT:  .Lcfi867:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1048:
+; NoVLX-NEXT:  .Lcfi868:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1049:
+; NoVLX-NEXT:  .Lcfi869:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1050:
+; NoVLX-NEXT:  .Lcfi870:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1051:
+; NoVLX-NEXT:  .Lcfi871:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
@@ -28122,10 +27402,6 @@
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -28168,11 +27444,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28184,11 +27460,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -28227,12 +27507,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1052:
+; NoVLX-NEXT:  .Lcfi872:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1053:
+; NoVLX-NEXT:  .Lcfi873:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1054:
+; NoVLX-NEXT:  .Lcfi874:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -28241,15 +27521,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1055:
+; NoVLX-NEXT:  .Lcfi875:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1056:
+; NoVLX-NEXT:  .Lcfi876:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1057:
+; NoVLX-NEXT:  .Lcfi877:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1058:
+; NoVLX-NEXT:  .Lcfi878:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1059:
+; NoVLX-NEXT:  .Lcfi879:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -28258,10 +27538,6 @@
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -28304,11 +27580,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28320,11 +27596,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -28364,12 +27644,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1060:
+; NoVLX-NEXT:  .Lcfi880:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1061:
+; NoVLX-NEXT:  .Lcfi881:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1062:
+; NoVLX-NEXT:  .Lcfi882:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -28378,15 +27658,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1063:
+; NoVLX-NEXT:  .Lcfi883:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1064:
+; NoVLX-NEXT:  .Lcfi884:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1065:
+; NoVLX-NEXT:  .Lcfi885:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1066:
+; NoVLX-NEXT:  .Lcfi886:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1067:
+; NoVLX-NEXT:  .Lcfi887:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
@@ -28396,10 +27676,6 @@
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -28442,11 +27718,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28458,11 +27734,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -28503,32 +27783,47 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1068:
+; NoVLX-NEXT:  .Lcfi888:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1069:
+; NoVLX-NEXT:  .Lcfi889:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1070:
+; NoVLX-NEXT:  .Lcfi890:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -28536,7 +27831,8 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -28546,39 +27842,82 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm5
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm5
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm6, %xmm5
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
 ; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -28594,94 +27933,35 @@
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm7, %xmm7
 ; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rcx
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm0
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm7, %xmm4
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm2
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -28857,29 +28137,47 @@
 ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1071:
+; NoVLX-NEXT:  .Lcfi891:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1072:
+; NoVLX-NEXT:  .Lcfi892:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1073:
+; NoVLX-NEXT:  .Lcfi893:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -28887,39 +28185,20 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -28927,7 +28206,8 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -28937,24 +28217,24 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; NoVLX-NEXT:    vmovdqa (%rdi), %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
-; NoVLX-NEXT:    vmovdqa 32(%rdi), %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm2
+; NoVLX-NEXT:    vmovdqa (%rdi), %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vmovdqa 32(%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm2
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm2, %ymm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
@@ -29130,12 +28410,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1074:
+; NoVLX-NEXT:  .Lcfi894:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1075:
+; NoVLX-NEXT:  .Lcfi895:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1076:
+; NoVLX-NEXT:  .Lcfi896:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -29146,17 +28426,12 @@
 ; NoVLX-NEXT:    movq %rax, %rdx
 ; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -29164,59 +28439,61 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
@@ -29224,7 +28501,8 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -29234,29 +28512,30 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm7, %xmm7
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -29264,47 +28543,18 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm4
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -29316,7 +28566,31 @@
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vmovd %eax, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm7, %xmm7
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rcx
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm7, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpcmpgtw %ymm4, %ymm2, %ymm2
 ; NoVLX-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
 ; NoVLX-NEXT:    vpxor %ymm4, %ymm2, %ymm2
@@ -29383,77 +28657,83 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm3
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
@@ -29496,12 +28776,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1077:
+; NoVLX-NEXT:  .Lcfi897:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1078:
+; NoVLX-NEXT:  .Lcfi898:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1079:
+; NoVLX-NEXT:  .Lcfi899:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -29513,8 +28793,6 @@
 ; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
@@ -29527,19 +28805,20 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -29547,6 +28826,7 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
@@ -29588,21 +28868,84 @@
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm4
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
 ; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm2
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; NoVLX-NEXT:    vmovdqa (%rsi), %ymm4
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm4, %ymm5
-; NoVLX-NEXT:    vmovdqa 32(%rsi), %ymm3
-; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm3
-; NoVLX-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; NoVLX-NEXT:    vpxor %ymm4, %ymm5, %ymm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vmovdqa (%rsi), %ymm3
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; NoVLX-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovdqa 32(%rsi), %ymm4
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm4, %ymm2
+; NoVLX-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
 ; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
@@ -29661,87 +29004,24 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -29798,8 +29078,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -29856,8 +29136,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -29896,7 +29176,6 @@
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -29908,13 +29187,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -29931,8 +29211,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -29974,7 +29254,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -29986,13 +29265,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30009,8 +29289,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -30073,8 +29353,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -30116,7 +29396,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30128,13 +29407,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30151,8 +29431,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -30213,8 +29493,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30270,8 +29550,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30310,7 +29590,6 @@
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30322,13 +29601,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30344,8 +29624,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30387,7 +29667,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30399,13 +29678,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30421,8 +29701,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30484,8 +29764,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30527,7 +29807,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30539,13 +29818,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30561,8 +29841,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30605,12 +29885,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1080:
+; NoVLX-NEXT:  .Lcfi900:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1081:
+; NoVLX-NEXT:  .Lcfi901:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1082:
+; NoVLX-NEXT:  .Lcfi902:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -30650,12 +29930,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1083:
+; NoVLX-NEXT:  .Lcfi903:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1084:
+; NoVLX-NEXT:  .Lcfi904:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1085:
+; NoVLX-NEXT:  .Lcfi905:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -30698,16 +29978,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1086:
+; NoVLX-NEXT:  .Lcfi906:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1087:
+; NoVLX-NEXT:  .Lcfi907:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1088:
+; NoVLX-NEXT:  .Lcfi908:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30719,13 +29998,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -30763,17 +30043,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1089:
+; NoVLX-NEXT:  .Lcfi909:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1090:
+; NoVLX-NEXT:  .Lcfi910:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1091:
+; NoVLX-NEXT:  .Lcfi911:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30785,13 +30064,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -30831,12 +30111,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1092:
+; NoVLX-NEXT:  .Lcfi912:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1093:
+; NoVLX-NEXT:  .Lcfi913:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1094:
+; NoVLX-NEXT:  .Lcfi914:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -30881,17 +30161,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1095:
+; NoVLX-NEXT:  .Lcfi915:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1096:
+; NoVLX-NEXT:  .Lcfi916:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1097:
+; NoVLX-NEXT:  .Lcfi917:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30903,13 +30182,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -30949,12 +30229,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1098:
+; NoVLX-NEXT:  .Lcfi918:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1099:
+; NoVLX-NEXT:  .Lcfi919:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1100:
+; NoVLX-NEXT:  .Lcfi920:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -30963,8 +30243,8 @@
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31000,12 +30280,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1101:
+; NoVLX-NEXT:  .Lcfi921:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1102:
+; NoVLX-NEXT:  .Lcfi922:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1103:
+; NoVLX-NEXT:  .Lcfi923:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -31015,8 +30295,8 @@
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31054,19 +30334,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1104:
+; NoVLX-NEXT:  .Lcfi924:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1105:
+; NoVLX-NEXT:  .Lcfi925:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1106:
+; NoVLX-NEXT:  .Lcfi926:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -31079,13 +30358,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31125,12 +30405,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1107:
+; NoVLX-NEXT:  .Lcfi927:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1108:
+; NoVLX-NEXT:  .Lcfi928:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1109:
+; NoVLX-NEXT:  .Lcfi929:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -31138,7 +30418,6 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -31151,13 +30430,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31199,12 +30479,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1110:
+; NoVLX-NEXT:  .Lcfi930:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1111:
+; NoVLX-NEXT:  .Lcfi931:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1112:
+; NoVLX-NEXT:  .Lcfi932:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -31214,8 +30494,8 @@
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31255,12 +30535,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1113:
+; NoVLX-NEXT:  .Lcfi933:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1114:
+; NoVLX-NEXT:  .Lcfi934:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1115:
+; NoVLX-NEXT:  .Lcfi935:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -31268,7 +30548,6 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -31281,13 +30560,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31520,12 +30800,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1116:
+; NoVLX-NEXT:  .Lcfi936:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1117:
+; NoVLX-NEXT:  .Lcfi937:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1118:
+; NoVLX-NEXT:  .Lcfi938:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31595,12 +30875,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1119:
+; NoVLX-NEXT:  .Lcfi939:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1120:
+; NoVLX-NEXT:  .Lcfi940:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1121:
+; NoVLX-NEXT:  .Lcfi941:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31672,12 +30952,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1122:
+; NoVLX-NEXT:  .Lcfi942:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1123:
+; NoVLX-NEXT:  .Lcfi943:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1124:
+; NoVLX-NEXT:  .Lcfi944:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31752,12 +31032,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1125:
+; NoVLX-NEXT:  .Lcfi945:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1126:
+; NoVLX-NEXT:  .Lcfi946:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1127:
+; NoVLX-NEXT:  .Lcfi947:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31834,12 +31114,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1128:
+; NoVLX-NEXT:  .Lcfi948:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1129:
+; NoVLX-NEXT:  .Lcfi949:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1130:
+; NoVLX-NEXT:  .Lcfi950:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31913,12 +31193,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1131:
+; NoVLX-NEXT:  .Lcfi951:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1132:
+; NoVLX-NEXT:  .Lcfi952:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1133:
+; NoVLX-NEXT:  .Lcfi953:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31995,55 +31275,55 @@
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1134:
+; NoVLX-NEXT:  .Lcfi954:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1135:
+; NoVLX-NEXT:  .Lcfi955:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1136:
+; NoVLX-NEXT:  .Lcfi956:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32075,55 +31355,55 @@
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1137:
+; NoVLX-NEXT:  .Lcfi957:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1138:
+; NoVLX-NEXT:  .Lcfi958:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1139:
+; NoVLX-NEXT:  .Lcfi959:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32157,12 +31437,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1140:
+; NoVLX-NEXT:  .Lcfi960:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1141:
+; NoVLX-NEXT:  .Lcfi961:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1142:
+; NoVLX-NEXT:  .Lcfi962:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -32171,43 +31451,43 @@
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32242,12 +31522,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1143:
+; NoVLX-NEXT:  .Lcfi963:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1144:
+; NoVLX-NEXT:  .Lcfi964:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1145:
+; NoVLX-NEXT:  .Lcfi965:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -32256,43 +31536,43 @@
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32329,55 +31609,55 @@
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1146:
+; NoVLX-NEXT:  .Lcfi966:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1147:
+; NoVLX-NEXT:  .Lcfi967:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1148:
+; NoVLX-NEXT:  .Lcfi968:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32413,12 +31693,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1149:
+; NoVLX-NEXT:  .Lcfi969:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1150:
+; NoVLX-NEXT:  .Lcfi970:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1151:
+; NoVLX-NEXT:  .Lcfi971:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -32427,43 +31707,43 @@
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k0, %k1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32500,93 +31780,78 @@
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1152:
+; NoVLX-NEXT:  .Lcfi972:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1153:
+; NoVLX-NEXT:  .Lcfi973:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1154:
+; NoVLX-NEXT:  .Lcfi974:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1155:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1156:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1157:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1158:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1159:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32595,12 +31860,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -32623,93 +31883,78 @@
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1160:
+; NoVLX-NEXT:  .Lcfi975:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1161:
+; NoVLX-NEXT:  .Lcfi976:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1162:
+; NoVLX-NEXT:  .Lcfi977:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1163:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1164:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1165:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1166:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1167:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpnltd (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32718,12 +31963,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -32748,94 +31988,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1168:
+; NoVLX-NEXT:  .Lcfi978:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1169:
+; NoVLX-NEXT:  .Lcfi979:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1170:
+; NoVLX-NEXT:  .Lcfi980:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1171:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1172:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1173:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1174:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1175:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32844,12 +32069,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -32875,94 +32095,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1176:
+; NoVLX-NEXT:  .Lcfi981:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1177:
+; NoVLX-NEXT:  .Lcfi982:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1178:
+; NoVLX-NEXT:  .Lcfi983:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1179:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1180:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1181:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1182:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1183:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32971,12 +32176,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -33004,94 +32204,79 @@
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1184:
+; NoVLX-NEXT:  .Lcfi984:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1185:
+; NoVLX-NEXT:  .Lcfi985:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1186:
+; NoVLX-NEXT:  .Lcfi986:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1187:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1188:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1189:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1190:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1191:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %zmm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -33100,12 +32285,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -33132,30 +32312,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1192:
+; NoVLX-NEXT:  .Lcfi987:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1193:
+; NoVLX-NEXT:  .Lcfi988:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1194:
+; NoVLX-NEXT:  .Lcfi989:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1195:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1196:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1197:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1198:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1199:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %zmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
@@ -33163,64 +32328,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -33229,12 +32394,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -33262,12 +32422,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1200:
+; NoVLX-NEXT:  .Lcfi990:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1201:
+; NoVLX-NEXT:  .Lcfi991:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1202:
+; NoVLX-NEXT:  .Lcfi992:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33276,21 +32436,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1203:
+; NoVLX-NEXT:  .Lcfi993:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1204:
+; NoVLX-NEXT:  .Lcfi994:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1205:
+; NoVLX-NEXT:  .Lcfi995:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1206:
+; NoVLX-NEXT:  .Lcfi996:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1207:
+; NoVLX-NEXT:  .Lcfi997:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33333,11 +32489,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33349,11 +32505,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33390,12 +32550,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1208:
+; NoVLX-NEXT:  .Lcfi998:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1209:
+; NoVLX-NEXT:  .Lcfi999:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1210:
+; NoVLX-NEXT:  .Lcfi1000:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33404,21 +32564,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1211:
+; NoVLX-NEXT:  .Lcfi1001:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1212:
+; NoVLX-NEXT:  .Lcfi1002:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1213:
+; NoVLX-NEXT:  .Lcfi1003:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1214:
+; NoVLX-NEXT:  .Lcfi1004:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1215:
+; NoVLX-NEXT:  .Lcfi1005:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpnltd (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33461,11 +32617,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33477,11 +32633,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33520,12 +32680,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1216:
+; NoVLX-NEXT:  .Lcfi1006:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1217:
+; NoVLX-NEXT:  .Lcfi1007:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1218:
+; NoVLX-NEXT:  .Lcfi1008:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33534,22 +32694,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1219:
+; NoVLX-NEXT:  .Lcfi1009:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1220:
+; NoVLX-NEXT:  .Lcfi1010:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1221:
+; NoVLX-NEXT:  .Lcfi1011:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1222:
+; NoVLX-NEXT:  .Lcfi1012:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1223:
+; NoVLX-NEXT:  .Lcfi1013:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33592,11 +32748,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33608,11 +32764,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33652,12 +32812,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1224:
+; NoVLX-NEXT:  .Lcfi1014:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1225:
+; NoVLX-NEXT:  .Lcfi1015:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1226:
+; NoVLX-NEXT:  .Lcfi1016:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33666,22 +32826,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1227:
+; NoVLX-NEXT:  .Lcfi1017:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1228:
+; NoVLX-NEXT:  .Lcfi1018:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1229:
+; NoVLX-NEXT:  .Lcfi1019:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1230:
+; NoVLX-NEXT:  .Lcfi1020:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1231:
+; NoVLX-NEXT:  .Lcfi1021:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33724,11 +32880,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33740,11 +32896,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33786,12 +32946,12 @@
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1232:
+; NoVLX-NEXT:  .Lcfi1022:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1233:
+; NoVLX-NEXT:  .Lcfi1023:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1234:
+; NoVLX-NEXT:  .Lcfi1024:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33800,22 +32960,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1235:
+; NoVLX-NEXT:  .Lcfi1025:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1236:
+; NoVLX-NEXT:  .Lcfi1026:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1237:
+; NoVLX-NEXT:  .Lcfi1027:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1238:
+; NoVLX-NEXT:  .Lcfi1028:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1239:
+; NoVLX-NEXT:  .Lcfi1029:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %zmm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33858,11 +33014,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33874,11 +33030,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33919,12 +33079,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1240:
+; NoVLX-NEXT:  .Lcfi1030:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1241:
+; NoVLX-NEXT:  .Lcfi1031:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1242:
+; NoVLX-NEXT:  .Lcfi1032:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33933,23 +33093,19 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1243:
+; NoVLX-NEXT:  .Lcfi1033:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1244:
+; NoVLX-NEXT:  .Lcfi1034:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1245:
+; NoVLX-NEXT:  .Lcfi1035:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1246:
+; NoVLX-NEXT:  .Lcfi1036:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1247:
+; NoVLX-NEXT:  .Lcfi1037:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %zmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33992,11 +33148,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -34008,11 +33164,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -34114,7 +33274,6 @@
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -34122,9 +33281,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -34156,7 +33316,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -34164,9 +33323,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -34234,7 +33394,6 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -34242,9 +33401,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -34863,12 +34023,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1248:
+; NoVLX-NEXT:  .Lcfi1038:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1249:
+; NoVLX-NEXT:  .Lcfi1039:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1250:
+; NoVLX-NEXT:  .Lcfi1040:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -34908,12 +34068,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1251:
+; NoVLX-NEXT:  .Lcfi1041:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1252:
+; NoVLX-NEXT:  .Lcfi1042:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1253:
+; NoVLX-NEXT:  .Lcfi1043:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -34956,16 +34116,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1254:
+; NoVLX-NEXT:  .Lcfi1044:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1255:
+; NoVLX-NEXT:  .Lcfi1045:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1256:
+; NoVLX-NEXT:  .Lcfi1046:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -34973,9 +34132,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -35013,17 +34173,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1257:
+; NoVLX-NEXT:  .Lcfi1047:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1258:
+; NoVLX-NEXT:  .Lcfi1048:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1259:
+; NoVLX-NEXT:  .Lcfi1049:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35031,9 +34190,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -35073,12 +34233,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1260:
+; NoVLX-NEXT:  .Lcfi1050:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1261:
+; NoVLX-NEXT:  .Lcfi1051:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1262:
+; NoVLX-NEXT:  .Lcfi1052:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -35123,17 +34283,16 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1263:
+; NoVLX-NEXT:  .Lcfi1053:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1264:
+; NoVLX-NEXT:  .Lcfi1054:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1265:
+; NoVLX-NEXT:  .Lcfi1055:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35141,9 +34300,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -35183,12 +34343,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1266:
+; NoVLX-NEXT:  .Lcfi1056:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1267:
+; NoVLX-NEXT:  .Lcfi1057:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1268:
+; NoVLX-NEXT:  .Lcfi1058:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35197,8 +34357,8 @@
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35234,12 +34394,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1269:
+; NoVLX-NEXT:  .Lcfi1059:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1270:
+; NoVLX-NEXT:  .Lcfi1060:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1271:
+; NoVLX-NEXT:  .Lcfi1061:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35249,8 +34409,8 @@
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35288,12 +34448,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1272:
+; NoVLX-NEXT:  .Lcfi1062:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1273:
+; NoVLX-NEXT:  .Lcfi1063:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1274:
+; NoVLX-NEXT:  .Lcfi1064:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35310,8 +34470,8 @@
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35351,12 +34511,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1275:
+; NoVLX-NEXT:  .Lcfi1065:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1276:
+; NoVLX-NEXT:  .Lcfi1066:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1277:
+; NoVLX-NEXT:  .Lcfi1067:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35374,8 +34534,8 @@
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35417,12 +34577,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1278:
+; NoVLX-NEXT:  .Lcfi1068:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1279:
+; NoVLX-NEXT:  .Lcfi1069:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1280:
+; NoVLX-NEXT:  .Lcfi1070:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35432,8 +34592,8 @@
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35473,12 +34633,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1281:
+; NoVLX-NEXT:  .Lcfi1071:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1282:
+; NoVLX-NEXT:  .Lcfi1072:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1283:
+; NoVLX-NEXT:  .Lcfi1073:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35496,8 +34656,8 @@
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35560,8 +34720,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35620,8 +34780,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35664,7 +34824,6 @@
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35682,6 +34841,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -35699,8 +34859,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35746,7 +34906,6 @@
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35764,6 +34923,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -35781,8 +34941,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35847,8 +35007,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35894,7 +35054,6 @@
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35912,6 +35071,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -35929,8 +35089,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35993,8 +35153,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36052,8 +35212,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36096,7 +35256,6 @@
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36114,6 +35273,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -36130,8 +35290,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36177,7 +35337,6 @@
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36195,6 +35354,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -36211,8 +35371,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36276,8 +35436,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36323,7 +35483,6 @@
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36341,6 +35500,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -36357,8 +35517,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36402,12 +35562,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1284:
+; NoVLX-NEXT:  .Lcfi1074:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1285:
+; NoVLX-NEXT:  .Lcfi1075:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1286:
+; NoVLX-NEXT:  .Lcfi1076:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36449,12 +35609,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1287:
+; NoVLX-NEXT:  .Lcfi1077:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1288:
+; NoVLX-NEXT:  .Lcfi1078:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1289:
+; NoVLX-NEXT:  .Lcfi1079:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36499,19 +35659,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1290:
+; NoVLX-NEXT:  .Lcfi1080:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1291:
+; NoVLX-NEXT:  .Lcfi1081:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1292:
+; NoVLX-NEXT:  .Lcfi1082:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36529,6 +35688,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -36568,12 +35728,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1293:
+; NoVLX-NEXT:  .Lcfi1083:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1294:
+; NoVLX-NEXT:  .Lcfi1084:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1295:
+; NoVLX-NEXT:  .Lcfi1085:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36581,7 +35741,6 @@
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36599,6 +35758,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -36640,12 +35800,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1296:
+; NoVLX-NEXT:  .Lcfi1086:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1297:
+; NoVLX-NEXT:  .Lcfi1087:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1298:
+; NoVLX-NEXT:  .Lcfi1088:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36692,12 +35852,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1299:
+; NoVLX-NEXT:  .Lcfi1089:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1300:
+; NoVLX-NEXT:  .Lcfi1090:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1301:
+; NoVLX-NEXT:  .Lcfi1091:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36705,7 +35865,6 @@
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36723,6 +35882,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -36764,12 +35924,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1302:
+; NoVLX-NEXT:  .Lcfi1092:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1303:
+; NoVLX-NEXT:  .Lcfi1093:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1304:
+; NoVLX-NEXT:  .Lcfi1094:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -36779,8 +35939,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -36817,12 +35977,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1305:
+; NoVLX-NEXT:  .Lcfi1095:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1306:
+; NoVLX-NEXT:  .Lcfi1096:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1307:
+; NoVLX-NEXT:  .Lcfi1097:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -36833,8 +35993,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -36873,12 +36033,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1308:
+; NoVLX-NEXT:  .Lcfi1098:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1309:
+; NoVLX-NEXT:  .Lcfi1099:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1310:
+; NoVLX-NEXT:  .Lcfi1100:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -36888,7 +36048,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -36901,13 +36060,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -36948,12 +36108,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1311:
+; NoVLX-NEXT:  .Lcfi1101:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1312:
+; NoVLX-NEXT:  .Lcfi1102:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1313:
+; NoVLX-NEXT:  .Lcfi1103:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -36964,7 +36124,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -36977,13 +36136,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -37026,12 +36186,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1314:
+; NoVLX-NEXT:  .Lcfi1104:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1315:
+; NoVLX-NEXT:  .Lcfi1105:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1316:
+; NoVLX-NEXT:  .Lcfi1106:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -37042,8 +36202,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -37084,12 +36244,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1317:
+; NoVLX-NEXT:  .Lcfi1107:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1318:
+; NoVLX-NEXT:  .Lcfi1108:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1319:
+; NoVLX-NEXT:  .Lcfi1109:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -37100,7 +36260,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -37113,13 +36272,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -37330,12 +36490,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1320:
+; NoVLX-NEXT:  .Lcfi1110:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1321:
+; NoVLX-NEXT:  .Lcfi1111:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1322:
+; NoVLX-NEXT:  .Lcfi1112:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37403,12 +36563,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1323:
+; NoVLX-NEXT:  .Lcfi1113:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1324:
+; NoVLX-NEXT:  .Lcfi1114:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1325:
+; NoVLX-NEXT:  .Lcfi1115:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37478,12 +36638,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1326:
+; NoVLX-NEXT:  .Lcfi1116:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1327:
+; NoVLX-NEXT:  .Lcfi1117:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1328:
+; NoVLX-NEXT:  .Lcfi1118:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37555,12 +36715,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1329:
+; NoVLX-NEXT:  .Lcfi1119:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1330:
+; NoVLX-NEXT:  .Lcfi1120:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1331:
+; NoVLX-NEXT:  .Lcfi1121:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37634,12 +36794,12 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1332:
+; NoVLX-NEXT:  .Lcfi1122:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1333:
+; NoVLX-NEXT:  .Lcfi1123:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1334:
+; NoVLX-NEXT:  .Lcfi1124:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37712,12 +36872,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1335:
+; NoVLX-NEXT:  .Lcfi1125:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1336:
+; NoVLX-NEXT:  .Lcfi1126:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1337:
+; NoVLX-NEXT:  .Lcfi1127:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37792,53 +36952,53 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1338:
+; NoVLX-NEXT:  .Lcfi1128:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1339:
+; NoVLX-NEXT:  .Lcfi1129:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1340:
+; NoVLX-NEXT:  .Lcfi1130:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -37870,53 +37030,53 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1341:
+; NoVLX-NEXT:  .Lcfi1131:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1342:
+; NoVLX-NEXT:  .Lcfi1132:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1343:
+; NoVLX-NEXT:  .Lcfi1133:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpnltq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -37950,54 +37110,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1344:
+; NoVLX-NEXT:  .Lcfi1134:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1345:
+; NoVLX-NEXT:  .Lcfi1135:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1346:
+; NoVLX-NEXT:  .Lcfi1136:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38032,54 +37192,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1347:
+; NoVLX-NEXT:  .Lcfi1137:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1348:
+; NoVLX-NEXT:  .Lcfi1138:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1349:
+; NoVLX-NEXT:  .Lcfi1139:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38116,54 +37276,54 @@
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1350:
+; NoVLX-NEXT:  .Lcfi1140:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1351:
+; NoVLX-NEXT:  .Lcfi1141:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1352:
+; NoVLX-NEXT:  .Lcfi1142:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %zmm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38199,55 +37359,55 @@
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1353:
+; NoVLX-NEXT:  .Lcfi1143:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1354:
+; NoVLX-NEXT:  .Lcfi1144:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1355:
+; NoVLX-NEXT:  .Lcfi1145:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %zmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38283,30 +37443,15 @@
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1356:
+; NoVLX-NEXT:  .Lcfi1146:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1357:
+; NoVLX-NEXT:  .Lcfi1147:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1358:
+; NoVLX-NEXT:  .Lcfi1148:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1359:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1360:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1361:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1362:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1363:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
@@ -38318,64 +37463,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38384,12 +37529,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -38411,30 +37551,15 @@
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1364:
+; NoVLX-NEXT:  .Lcfi1149:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1365:
+; NoVLX-NEXT:  .Lcfi1150:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1366:
+; NoVLX-NEXT:  .Lcfi1151:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1367:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1368:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1369:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1370:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1371:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
@@ -38446,64 +37571,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38512,12 +37637,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -38541,30 +37661,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1372:
+; NoVLX-NEXT:  .Lcfi1152:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1373:
+; NoVLX-NEXT:  .Lcfi1153:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1374:
+; NoVLX-NEXT:  .Lcfi1154:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1375:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1376:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1377:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1378:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1379:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
@@ -38577,64 +37682,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38643,12 +37748,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -38673,30 +37773,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1380:
+; NoVLX-NEXT:  .Lcfi1155:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1381:
+; NoVLX-NEXT:  .Lcfi1156:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1382:
+; NoVLX-NEXT:  .Lcfi1157:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1383:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1384:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1385:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1386:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1387:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
@@ -38709,64 +37794,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38775,12 +37860,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -38806,12 +37886,12 @@
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1388:
+; NoVLX-NEXT:  .Lcfi1158:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1389:
+; NoVLX-NEXT:  .Lcfi1159:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1390:
+; NoVLX-NEXT:  .Lcfi1160:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -38820,15 +37900,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1391:
+; NoVLX-NEXT:  .Lcfi1161:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1392:
+; NoVLX-NEXT:  .Lcfi1162:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1393:
+; NoVLX-NEXT:  .Lcfi1163:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1394:
+; NoVLX-NEXT:  .Lcfi1164:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1395:
+; NoVLX-NEXT:  .Lcfi1165:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
@@ -38837,10 +37917,6 @@
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -38883,11 +37959,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -38899,11 +37975,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38939,12 +38019,12 @@
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1396:
+; NoVLX-NEXT:  .Lcfi1166:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1397:
+; NoVLX-NEXT:  .Lcfi1167:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1398:
+; NoVLX-NEXT:  .Lcfi1168:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -38953,27 +38033,23 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1399:
+; NoVLX-NEXT:  .Lcfi1169:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1400:
+; NoVLX-NEXT:  .Lcfi1170:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1401:
+; NoVLX-NEXT:  .Lcfi1171:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1402:
+; NoVLX-NEXT:  .Lcfi1172:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1403:
+; NoVLX-NEXT:  .Lcfi1173:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -39016,11 +38092,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -39032,11 +38108,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -39074,12 +38154,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1404:
+; NoVLX-NEXT:  .Lcfi1174:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1405:
+; NoVLX-NEXT:  .Lcfi1175:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1406:
+; NoVLX-NEXT:  .Lcfi1176:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -39088,15 +38168,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1407:
+; NoVLX-NEXT:  .Lcfi1177:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1408:
+; NoVLX-NEXT:  .Lcfi1178:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1409:
+; NoVLX-NEXT:  .Lcfi1179:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1410:
+; NoVLX-NEXT:  .Lcfi1180:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1411:
+; NoVLX-NEXT:  .Lcfi1181:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
@@ -39106,10 +38186,6 @@
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -39152,11 +38228,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -39168,11 +38244,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -39211,12 +38291,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1412:
+; NoVLX-NEXT:  .Lcfi1182:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1413:
+; NoVLX-NEXT:  .Lcfi1183:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1414:
+; NoVLX-NEXT:  .Lcfi1184:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -39225,28 +38305,24 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1415:
+; NoVLX-NEXT:  .Lcfi1185:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1416:
+; NoVLX-NEXT:  .Lcfi1186:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1417:
+; NoVLX-NEXT:  .Lcfi1187:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1418:
+; NoVLX-NEXT:  .Lcfi1188:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1419:
+; NoVLX-NEXT:  .Lcfi1189:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -39289,11 +38365,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -39305,11 +38381,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -39350,12 +38430,12 @@
 ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1420:
+; NoVLX-NEXT:  .Lcfi1190:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1421:
+; NoVLX-NEXT:  .Lcfi1191:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1422:
+; NoVLX-NEXT:  .Lcfi1192:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -39402,19 +38482,19 @@
 ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1423:
+; NoVLX-NEXT:  .Lcfi1193:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1424:
+; NoVLX-NEXT:  .Lcfi1194:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1425:
+; NoVLX-NEXT:  .Lcfi1195:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm2
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -39456,12 +38536,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1426:
+; NoVLX-NEXT:  .Lcfi1196:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1427:
+; NoVLX-NEXT:  .Lcfi1197:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1428:
+; NoVLX-NEXT:  .Lcfi1198:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -39520,12 +38600,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1429:
+; NoVLX-NEXT:  .Lcfi1199:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1430:
+; NoVLX-NEXT:  .Lcfi1200:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1431:
+; NoVLX-NEXT:  .Lcfi1201:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -39719,12 +38799,12 @@
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1432:
+; NoVLX-NEXT:  .Lcfi1202:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1433:
+; NoVLX-NEXT:  .Lcfi1203:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1434:
+; NoVLX-NEXT:  .Lcfi1204:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -39797,12 +38877,12 @@
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1435:
+; NoVLX-NEXT:  .Lcfi1205:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1436:
+; NoVLX-NEXT:  .Lcfi1206:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1437:
+; NoVLX-NEXT:  .Lcfi1207:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -39877,12 +38957,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1438:
+; NoVLX-NEXT:  .Lcfi1208:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1439:
+; NoVLX-NEXT:  .Lcfi1209:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1440:
+; NoVLX-NEXT:  .Lcfi1210:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -39959,12 +39039,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1441:
+; NoVLX-NEXT:  .Lcfi1211:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1442:
+; NoVLX-NEXT:  .Lcfi1212:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1443:
+; NoVLX-NEXT:  .Lcfi1213:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -40042,12 +39122,12 @@
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1444:
+; NoVLX-NEXT:  .Lcfi1214:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1445:
+; NoVLX-NEXT:  .Lcfi1215:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1446:
+; NoVLX-NEXT:  .Lcfi1216:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -40058,43 +39138,43 @@
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -40125,59 +39205,59 @@
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1447:
+; NoVLX-NEXT:  .Lcfi1217:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1448:
+; NoVLX-NEXT:  .Lcfi1218:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1449:
+; NoVLX-NEXT:  .Lcfi1219:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -40210,12 +39290,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1450:
+; NoVLX-NEXT:  .Lcfi1220:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1451:
+; NoVLX-NEXT:  .Lcfi1221:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1452:
+; NoVLX-NEXT:  .Lcfi1222:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -40227,43 +39307,43 @@
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -40297,60 +39377,60 @@
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1453:
+; NoVLX-NEXT:  .Lcfi1223:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1454:
+; NoVLX-NEXT:  .Lcfi1224:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1455:
+; NoVLX-NEXT:  .Lcfi1225:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -40386,30 +39466,15 @@
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1456:
+; NoVLX-NEXT:  .Lcfi1226:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1457:
+; NoVLX-NEXT:  .Lcfi1227:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1458:
+; NoVLX-NEXT:  .Lcfi1228:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1459:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1460:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1461:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1462:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1463:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
@@ -40421,64 +39486,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40487,12 +39552,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -40515,30 +39575,15 @@
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1464:
+; NoVLX-NEXT:  .Lcfi1229:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1465:
+; NoVLX-NEXT:  .Lcfi1230:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1466:
+; NoVLX-NEXT:  .Lcfi1231:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1467:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1468:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1469:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1470:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1471:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
@@ -40550,64 +39595,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40616,12 +39661,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -40646,30 +39686,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1472:
+; NoVLX-NEXT:  .Lcfi1232:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1473:
+; NoVLX-NEXT:  .Lcfi1233:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1474:
+; NoVLX-NEXT:  .Lcfi1234:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1475:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1476:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1477:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1478:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1479:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
@@ -40682,64 +39707,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40748,12 +39773,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -40779,30 +39799,15 @@
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1480:
+; NoVLX-NEXT:  .Lcfi1235:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1481:
+; NoVLX-NEXT:  .Lcfi1236:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1482:
+; NoVLX-NEXT:  .Lcfi1237:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1483:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1484:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1485:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1486:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1487:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
@@ -40815,64 +39820,64 @@
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40881,12 +39886,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -40913,12 +39913,12 @@
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1488:
+; NoVLX-NEXT:  .Lcfi1238:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1489:
+; NoVLX-NEXT:  .Lcfi1239:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1490:
+; NoVLX-NEXT:  .Lcfi1240:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -40927,15 +39927,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1491:
+; NoVLX-NEXT:  .Lcfi1241:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1492:
+; NoVLX-NEXT:  .Lcfi1242:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1493:
+; NoVLX-NEXT:  .Lcfi1243:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1494:
+; NoVLX-NEXT:  .Lcfi1244:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1495:
+; NoVLX-NEXT:  .Lcfi1245:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
@@ -40944,10 +39944,6 @@
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -40990,11 +39986,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41006,11 +40002,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -41047,12 +40047,12 @@
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1496:
+; NoVLX-NEXT:  .Lcfi1246:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1497:
+; NoVLX-NEXT:  .Lcfi1247:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1498:
+; NoVLX-NEXT:  .Lcfi1248:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -41061,27 +40061,23 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1499:
+; NoVLX-NEXT:  .Lcfi1249:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1500:
+; NoVLX-NEXT:  .Lcfi1250:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1501:
+; NoVLX-NEXT:  .Lcfi1251:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1502:
+; NoVLX-NEXT:  .Lcfi1252:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1503:
+; NoVLX-NEXT:  .Lcfi1253:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm2
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -41124,11 +40120,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41140,11 +40136,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -41183,12 +40183,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1504:
+; NoVLX-NEXT:  .Lcfi1254:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1505:
+; NoVLX-NEXT:  .Lcfi1255:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1506:
+; NoVLX-NEXT:  .Lcfi1256:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -41197,15 +40197,15 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1507:
+; NoVLX-NEXT:  .Lcfi1257:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1508:
+; NoVLX-NEXT:  .Lcfi1258:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1509:
+; NoVLX-NEXT:  .Lcfi1259:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1510:
+; NoVLX-NEXT:  .Lcfi1260:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1511:
+; NoVLX-NEXT:  .Lcfi1261:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
@@ -41215,10 +40215,6 @@
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -41261,11 +40257,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41277,11 +40273,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -41321,12 +40321,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1512:
+; NoVLX-NEXT:  .Lcfi1262:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1513:
+; NoVLX-NEXT:  .Lcfi1263:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1514:
+; NoVLX-NEXT:  .Lcfi1264:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -41335,28 +40335,24 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1515:
+; NoVLX-NEXT:  .Lcfi1265:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1516:
+; NoVLX-NEXT:  .Lcfi1266:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1517:
+; NoVLX-NEXT:  .Lcfi1267:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1518:
+; NoVLX-NEXT:  .Lcfi1268:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1519:
+; NoVLX-NEXT:  .Lcfi1269:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm2
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -41399,11 +40395,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41415,11 +40411,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -41460,32 +40460,47 @@
 ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1520:
+; NoVLX-NEXT:  .Lcfi1270:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1521:
+; NoVLX-NEXT:  .Lcfi1271:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1522:
+; NoVLX-NEXT:  .Lcfi1272:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -41493,9 +40508,31 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm9
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm5
@@ -41513,29 +40550,41 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vmovd %eax, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm6, %xmm6
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
 ; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -41551,171 +40600,122 @@
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rax
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm3, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm2
+; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor %ymm1, %ymm4, %ymm3
+; NoVLX-NEXT:    vpxor %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm6, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm4
-; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm2, %ymm3, %ymm3
-; NoVLX-NEXT:    vpxor %ymm2, %ymm4, %ymm4
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm4, %ymm3
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm3
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpmovsxbd %xmm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpxor %ymm1, %ymm3, %ymm2
+; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -41816,177 +40816,177 @@
 ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1523:
+; NoVLX-NEXT:  .Lcfi1273:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1524:
+; NoVLX-NEXT:  .Lcfi1274:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1525:
+; NoVLX-NEXT:  .Lcfi1275:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
-; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $6, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rdx
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    movl %edx, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rdx, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rdx
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
+; NoVLX-NEXT:    movl %edx, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rdx, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm0 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor %ymm0, %ymm3, %ymm3
+; NoVLX-NEXT:    vpxor 32(%rdi), %ymm0, %ymm4
+; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm4, %ymm3
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm2
-; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm1, %ymm2, %ymm2
-; NoVLX-NEXT:    vpxor 32(%rdi), %ymm1, %ymm3
-; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm2
 ; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
 ; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpxor %ymm0, %ymm1, %ymm1
+; NoVLX-NEXT:    vpxor (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -42089,12 +41089,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1526:
+; NoVLX-NEXT:  .Lcfi1276:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1527:
+; NoVLX-NEXT:  .Lcfi1277:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1528:
+; NoVLX-NEXT:  .Lcfi1278:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -42105,17 +41105,12 @@
 ; NoVLX-NEXT:    movq %rax, %rdx
 ; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
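
Throughout these NoVLX sequences, each 128-bit subvector is rebuilt word by word: vmovq/vpextrq pull a 64-bit lane into a GPR, and shrl/shrq expose each 16-bit word for vpinsrw. A scalar sketch of that decomposition in C (illustrative only; split_qword is a made-up name, not part of the test):

#include <assert.h>
#include <stdint.h>

/* Models the vmovq/shr/vpinsrw sequence: a 64-bit lane is pulled into
 * a GPR and its four 16-bit words are re-inserted one at a time. */
static void split_qword(uint64_t q, uint16_t w[4]) {
    w[0] = (uint16_t)q;            /* vmovd             */
    w[1] = (uint16_t)(q >> 16);    /* shrl $16, vpinsrw $1 */
    w[2] = (uint16_t)(q >> 32);    /* shrq $32, vpinsrw $2 */
    w[3] = (uint16_t)(q >> 48);    /* shrq $48, vpinsrw $3 */
}

int main(void) {
    uint16_t w[4];
    split_qword(0x0123456789ABCDEFull, w);
    assert(w[0] == 0xCDEF && w[1] == 0x89AB && w[2] == 0x4567 && w[3] == 0x0123);
    return 0;
}
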
@@ -42123,9 +41118,10 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm4
@@ -42143,89 +41139,10 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm9
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm5
@@ -42233,9 +41150,50 @@
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm5
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm6, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
@@ -42243,26 +41201,39 @@
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm6
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm6
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm8
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm4
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -42270,15 +41241,109 @@
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm5, %ymm6, %ymm3
-; NoVLX-NEXT:    vpxor %ymm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm2
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm4, %ymm5
+; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; NoVLX-NEXT:    vpxor %ymm4, %ymm5, %ymm5
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm5, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpxor %ymm4, %ymm2, %ymm2
+; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm3
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
 ; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
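
The kshiftlw/kshiftrw pairs above isolate a single bit of a 16-bit mask register: shifting left by 15-i and then right by 15 leaves bit i alone in the low position, ready for kmovw and vpinsrb. A scalar model of the bit isolation (hedged C sketch; kmask_bit is a hypothetical helper):

#include <assert.h>
#include <stdint.h>

/* Models the kshiftlw $(15-i) / kshiftrw $15 pair: after the two
 * 16-bit shifts, only bit i of the mask survives, in bit position 0. */
static uint16_t kmask_bit(uint16_t k, unsigned i) {
    return (uint16_t)((uint16_t)(k << (15u - i)) >> 15);
}

int main(void) {
    uint16_t k = 0xA5C3;
    for (unsigned i = 0; i < 16; i++)
        assert(kmask_bit(k, i) == ((k >> i) & 1u));
    return 0;
}
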
@@ -42337,16 +41402,230 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <32 x i16>
+  %1 = bitcast <8 x i64> %__b to <32 x i16>
+  %2 = icmp ult <32 x i16> %0, %1
+  %3 = bitcast i32 %__u to <32 x i1>
+  %4 = and <32 x i1> %2, %3
+  %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
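
The vpxor-with-32768 splats feeding vpcmpgtw in these NoVLX checks emulate an unsigned word compare on signed hardware, using the identity that a <u b exactly when (a ^ 0x8000) <s (b ^ 0x8000). A minimal C sketch of that identity (illustrative; ult_via_signed_gt is a made-up name, not part of the test):

#include <assert.h>
#include <stdint.h>

/* Models one lane of the NoVLX lowering: vpxor with a 0x8000 splat
 * flips the sign bit, after which the signed vpcmpgtw ordering
 * matches the unsigned one (assumes two's complement). */
static int ult_via_signed_gt(uint16_t a, uint16_t b) {
    int16_t fa = (int16_t)(a ^ 0x8000u);
    int16_t fb = (int16_t)(b ^ 0x8000u);
    return fb > fa; /* a <u b  <=>  fb >s fa */
}

int main(void) {
    const uint16_t v[] = {0, 1, 0x7fff, 0x8000, 0x8001, 0xffff};
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            assert(ult_via_signed_gt(v[i], v[j]) == (v[i] < v[j]));
    return 0;
}
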
+define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpltuw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1279:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1280:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1281:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm4
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm1
+; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm3 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpxor (%rsi), %ymm3, %ymm4
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm4, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm3
-; NoVLX-NEXT:    vpxor %ymm5, %ymm8, %ymm2
-; NoVLX-NEXT:    vpxor %ymm5, %ymm4, %ymm4
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpxor 32(%rsi), %ymm3, %ymm4
+; NoVLX-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm4, %ymm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
 ; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
@@ -42406,303 +41685,24 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm3, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
-; NoVLX-NEXT:    retq
-entry:
-  %0 = bitcast <8 x i64> %__a to <32 x i16>
-  %1 = bitcast <8 x i64> %__b to <32 x i16>
-  %2 = icmp ult <32 x i16> %0, %1
-  %3 = bitcast i32 %__u to <32 x i1>
-  %4 = and <32 x i1> %2, %3
-  %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-  %6 = bitcast <64 x i1> %5 to i64
-  ret i64 %6
-}
-
-define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    kmovd %edi, %k1
-; VLX-NEXT:    vpcmpltuw (%rsi), %zmm0, %k0 {%k1}
-; VLX-NEXT:    kmovq %k0, %rax
-; VLX-NEXT:    vzeroupper
-; VLX-NEXT:    retq
-;
-; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
-; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1529:
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1530:
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1531:
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; NoVLX-NEXT:    vmovq %xmm1, %rax
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm3, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm4
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm5
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm6
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm3
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm6, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm4, %ymm2, %ymm2
-; NoVLX-NEXT:    vpxor (%rsi), %ymm4, %ymm5
-; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm5, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %eax, %xmm2
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT:    vpxor 32(%rsi), %ymm4, %ymm4
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm4, %ymm3
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
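
The vpslld $31 / vptestmd pairs convert a vector of 0/1 lanes back into a mask register: the shift moves bit 0 into the sign position, and the test sets the mask bit for every nonzero lane. One lane modeled in C (assumed semantics, illustrative only):

#include <assert.h>
#include <stdint.h>

/* Models vpslld $31 + vptestmd on one 32-bit lane: the shift keeps
 * only bit 0, and the test sets the mask bit iff the lane is nonzero,
 * so the result is exactly the low bit of the input lane. */
static unsigned mask_bit_from_lane(uint32_t lane) {
    return (lane << 31) != 0;
}

int main(void) {
    assert(mask_bit_from_lane(1) == 1);
    assert(mask_bit_from_lane(0) == 0);
    assert(mask_bit_from_lane(0xFF) == 1); /* only bit 0 matters */
    assert(mask_bit_from_lane(2) == 0);
    return 0;
}
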
@@ -42760,8 +41760,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
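
vpternlogd/vpternlogq with immediate 255 and a zeroing mask ({%k1} {z}) serve here as a mask-to-vector broadcast: truth table 0xFF produces all-ones in every lane regardless of the inputs, and the zeroing mask clears lanes whose mask bit is unset. In scalar terms (hedged sketch; lane_from_mask_bit is a made-up name):

#include <assert.h>
#include <stdint.h>

/* Models vpternlogd $255, z, z, z {%k1} {z} per 32-bit lane. */
static uint32_t lane_from_mask_bit(unsigned mask_bit) {
    return mask_bit ? 0xFFFFFFFFu : 0u;
}

int main(void) {
    uint16_t k = 0x00F3; /* hypothetical mask value */
    uint32_t z[16];
    for (unsigned i = 0; i < 16; i++)
        z[i] = lane_from_mask_bit((k >> i) & 1u);
    assert(z[0] == 0xFFFFFFFFu && z[2] == 0u && z[4] == 0xFFFFFFFFu);
    return 0;
}
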
@@ -42799,9 +41799,9 @@
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -42818,8 +41818,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -42861,7 +41861,6 @@
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -42873,13 +41872,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -42896,8 +41896,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -42941,7 +41941,6 @@
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -42953,13 +41952,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -42976,8 +41976,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -43040,8 +42040,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -43085,7 +42085,6 @@
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -43097,13 +42096,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43120,8 +42120,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -43183,8 +42183,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43222,9 +42222,9 @@
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43240,8 +42240,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43283,7 +42283,6 @@
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -43295,13 +42294,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43317,8 +42317,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43362,7 +42362,6 @@
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -43374,13 +42373,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43396,8 +42396,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43459,8 +42459,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43504,7 +42504,6 @@
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -43516,13 +42515,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43538,8 +42538,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43582,12 +42582,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1532:
+; NoVLX-NEXT:  .Lcfi1282:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1533:
+; NoVLX-NEXT:  .Lcfi1283:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1534:
+; NoVLX-NEXT:  .Lcfi1284:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -43628,19 +42628,19 @@
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1535:
+; NoVLX-NEXT:  .Lcfi1285:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1536:
+; NoVLX-NEXT:  .Lcfi1286:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1537:
+; NoVLX-NEXT:  .Lcfi1287:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -43676,19 +42676,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1538:
+; NoVLX-NEXT:  .Lcfi1288:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1539:
+; NoVLX-NEXT:  .Lcfi1289:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1540:
+; NoVLX-NEXT:  .Lcfi1290:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -43700,13 +42699,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -43744,19 +42744,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1541:
+; NoVLX-NEXT:  .Lcfi1291:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1542:
+; NoVLX-NEXT:  .Lcfi1292:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1543:
+; NoVLX-NEXT:  .Lcfi1293:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -43768,13 +42767,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -43813,12 +42813,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1544:
+; NoVLX-NEXT:  .Lcfi1294:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1545:
+; NoVLX-NEXT:  .Lcfi1295:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1546:
+; NoVLX-NEXT:  .Lcfi1296:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -43863,12 +42863,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1547:
+; NoVLX-NEXT:  .Lcfi1297:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1548:
+; NoVLX-NEXT:  .Lcfi1298:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1549:
+; NoVLX-NEXT:  .Lcfi1299:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -43876,7 +42876,6 @@
 ; NoVLX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -43888,13 +42887,14 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -43934,12 +42934,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1550:
+; NoVLX-NEXT:  .Lcfi1300:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1551:
+; NoVLX-NEXT:  .Lcfi1301:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1552:
+; NoVLX-NEXT:  .Lcfi1302:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -43949,8 +42949,8 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -43986,12 +42986,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1553:
+; NoVLX-NEXT:  .Lcfi1303:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1554:
+; NoVLX-NEXT:  .Lcfi1304:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1555:
+; NoVLX-NEXT:  .Lcfi1305:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -44001,8 +43001,8 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -44040,12 +43040,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1556:
+; NoVLX-NEXT:  .Lcfi1306:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1557:
+; NoVLX-NEXT:  .Lcfi1307:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1558:
+; NoVLX-NEXT:  .Lcfi1308:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -44055,7 +43055,6 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -44068,13 +43067,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -44114,12 +43114,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1559:
+; NoVLX-NEXT:  .Lcfi1309:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1560:
+; NoVLX-NEXT:  .Lcfi1310:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1561:
+; NoVLX-NEXT:  .Lcfi1311:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -44129,7 +43129,6 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -44142,13 +43141,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -44189,12 +43189,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1562:
+; NoVLX-NEXT:  .Lcfi1312:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1563:
+; NoVLX-NEXT:  .Lcfi1313:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1564:
+; NoVLX-NEXT:  .Lcfi1314:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -44205,8 +43205,8 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -44245,12 +43245,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1565:
+; NoVLX-NEXT:  .Lcfi1315:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1566:
+; NoVLX-NEXT:  .Lcfi1316:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1567:
+; NoVLX-NEXT:  .Lcfi1317:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -44261,7 +43261,6 @@
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -44274,13 +43273,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -44511,12 +43511,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1568:
+; NoVLX-NEXT:  .Lcfi1318:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1569:
+; NoVLX-NEXT:  .Lcfi1319:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1570:
+; NoVLX-NEXT:  .Lcfi1320:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -44586,12 +43586,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1571:
+; NoVLX-NEXT:  .Lcfi1321:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1572:
+; NoVLX-NEXT:  .Lcfi1322:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1573:
+; NoVLX-NEXT:  .Lcfi1323:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -44663,12 +43663,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1574:
+; NoVLX-NEXT:  .Lcfi1324:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1575:
+; NoVLX-NEXT:  .Lcfi1325:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1576:
+; NoVLX-NEXT:  .Lcfi1326:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -44743,12 +43743,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1577:
+; NoVLX-NEXT:  .Lcfi1327:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1578:
+; NoVLX-NEXT:  .Lcfi1328:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1579:
+; NoVLX-NEXT:  .Lcfi1329:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -44824,12 +43824,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1580:
+; NoVLX-NEXT:  .Lcfi1330:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1581:
+; NoVLX-NEXT:  .Lcfi1331:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1582:
+; NoVLX-NEXT:  .Lcfi1332:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -44902,12 +43902,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1583:
+; NoVLX-NEXT:  .Lcfi1333:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1584:
+; NoVLX-NEXT:  .Lcfi1334:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1585:
+; NoVLX-NEXT:  .Lcfi1335:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -44984,55 +43984,55 @@
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1586:
+; NoVLX-NEXT:  .Lcfi1336:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1587:
+; NoVLX-NEXT:  .Lcfi1337:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1588:
+; NoVLX-NEXT:  .Lcfi1338:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -45064,55 +44064,55 @@
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1589:
+; NoVLX-NEXT:  .Lcfi1339:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1590:
+; NoVLX-NEXT:  .Lcfi1340:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1591:
+; NoVLX-NEXT:  .Lcfi1341:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -45146,12 +44146,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1592:
+; NoVLX-NEXT:  .Lcfi1342:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1593:
+; NoVLX-NEXT:  .Lcfi1343:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1594:
+; NoVLX-NEXT:  .Lcfi1344:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -45160,43 +44160,43 @@
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -45231,12 +44231,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1595:
+; NoVLX-NEXT:  .Lcfi1345:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1596:
+; NoVLX-NEXT:  .Lcfi1346:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1597:
+; NoVLX-NEXT:  .Lcfi1347:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -45245,43 +44245,43 @@
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -45317,55 +44317,55 @@
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1598:
+; NoVLX-NEXT:  .Lcfi1348:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1599:
+; NoVLX-NEXT:  .Lcfi1349:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1600:
+; NoVLX-NEXT:  .Lcfi1350:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -45400,12 +44400,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1601:
+; NoVLX-NEXT:  .Lcfi1351:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1602:
+; NoVLX-NEXT:  .Lcfi1352:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1603:
+; NoVLX-NEXT:  .Lcfi1353:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -45414,43 +44414,43 @@
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k0, %k1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -45487,93 +44487,78 @@
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1604:
+; NoVLX-NEXT:  .Lcfi1354:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1605:
+; NoVLX-NEXT:  .Lcfi1355:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1606:
+; NoVLX-NEXT:  .Lcfi1356:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1607:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1608:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1609:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1610:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1611:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -45582,12 +44567,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -45610,93 +44590,78 @@
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1612:
+; NoVLX-NEXT:  .Lcfi1357:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1613:
+; NoVLX-NEXT:  .Lcfi1358:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1614:
+; NoVLX-NEXT:  .Lcfi1359:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1615:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1616:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1617:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1618:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1619:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpltud (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -45705,12 +44670,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -45735,94 +44695,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1620:
+; NoVLX-NEXT:  .Lcfi1360:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1621:
+; NoVLX-NEXT:  .Lcfi1361:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1622:
+; NoVLX-NEXT:  .Lcfi1362:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1623:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1624:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1625:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1626:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1627:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -45831,12 +44776,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -45862,94 +44802,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1628:
+; NoVLX-NEXT:  .Lcfi1363:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1629:
+; NoVLX-NEXT:  .Lcfi1364:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1630:
+; NoVLX-NEXT:  .Lcfi1365:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1631:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1632:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1633:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1634:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1635:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -45958,12 +44883,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -45990,93 +44910,78 @@
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1636:
+; NoVLX-NEXT:  .Lcfi1366:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1637:
+; NoVLX-NEXT:  .Lcfi1367:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1638:
+; NoVLX-NEXT:  .Lcfi1368:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1639:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1640:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1641:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1642:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1643:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpltud (%rdi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -46085,12 +44990,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -46116,94 +45016,79 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1644:
+; NoVLX-NEXT:  .Lcfi1369:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1645:
+; NoVLX-NEXT:  .Lcfi1370:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1646:
+; NoVLX-NEXT:  .Lcfi1371:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1647:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1648:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1649:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1650:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1651:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -46212,12 +45097,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -46245,12 +45125,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1652:
+; NoVLX-NEXT:  .Lcfi1372:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1653:
+; NoVLX-NEXT:  .Lcfi1373:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1654:
+; NoVLX-NEXT:  .Lcfi1374:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -46259,21 +45139,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1655:
+; NoVLX-NEXT:  .Lcfi1375:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1656:
+; NoVLX-NEXT:  .Lcfi1376:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1657:
+; NoVLX-NEXT:  .Lcfi1377:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1658:
+; NoVLX-NEXT:  .Lcfi1378:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1659:
+; NoVLX-NEXT:  .Lcfi1379:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -46316,11 +45192,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46332,11 +45208,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -46373,12 +45253,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1660:
+; NoVLX-NEXT:  .Lcfi1380:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1661:
+; NoVLX-NEXT:  .Lcfi1381:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1662:
+; NoVLX-NEXT:  .Lcfi1382:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -46387,21 +45267,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1663:
+; NoVLX-NEXT:  .Lcfi1383:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1664:
+; NoVLX-NEXT:  .Lcfi1384:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1665:
+; NoVLX-NEXT:  .Lcfi1385:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1666:
+; NoVLX-NEXT:  .Lcfi1386:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1667:
+; NoVLX-NEXT:  .Lcfi1387:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpltud (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -46444,11 +45320,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46460,11 +45336,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -46503,12 +45383,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1668:
+; NoVLX-NEXT:  .Lcfi1388:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1669:
+; NoVLX-NEXT:  .Lcfi1389:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1670:
+; NoVLX-NEXT:  .Lcfi1390:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -46517,22 +45397,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1671:
+; NoVLX-NEXT:  .Lcfi1391:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1672:
+; NoVLX-NEXT:  .Lcfi1392:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1673:
+; NoVLX-NEXT:  .Lcfi1393:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1674:
+; NoVLX-NEXT:  .Lcfi1394:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1675:
+; NoVLX-NEXT:  .Lcfi1395:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -46575,11 +45451,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46591,11 +45467,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -46635,12 +45515,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1676:
+; NoVLX-NEXT:  .Lcfi1396:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1677:
+; NoVLX-NEXT:  .Lcfi1397:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1678:
+; NoVLX-NEXT:  .Lcfi1398:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -46649,22 +45529,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1679:
+; NoVLX-NEXT:  .Lcfi1399:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1680:
+; NoVLX-NEXT:  .Lcfi1400:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1681:
+; NoVLX-NEXT:  .Lcfi1401:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1682:
+; NoVLX-NEXT:  .Lcfi1402:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1683:
+; NoVLX-NEXT:  .Lcfi1403:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -46707,11 +45583,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46723,11 +45599,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -46768,12 +45648,12 @@
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1684:
+; NoVLX-NEXT:  .Lcfi1404:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1685:
+; NoVLX-NEXT:  .Lcfi1405:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1686:
+; NoVLX-NEXT:  .Lcfi1406:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -46782,21 +45662,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1687:
+; NoVLX-NEXT:  .Lcfi1407:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1688:
+; NoVLX-NEXT:  .Lcfi1408:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1689:
+; NoVLX-NEXT:  .Lcfi1409:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1690:
+; NoVLX-NEXT:  .Lcfi1410:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1691:
+; NoVLX-NEXT:  .Lcfi1411:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpltud (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -46839,11 +45715,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46855,11 +45731,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -46899,12 +45779,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1692:
+; NoVLX-NEXT:  .Lcfi1412:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1693:
+; NoVLX-NEXT:  .Lcfi1413:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1694:
+; NoVLX-NEXT:  .Lcfi1414:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -46913,22 +45793,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1695:
+; NoVLX-NEXT:  .Lcfi1415:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1696:
+; NoVLX-NEXT:  .Lcfi1416:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1697:
+; NoVLX-NEXT:  .Lcfi1417:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1698:
+; NoVLX-NEXT:  .Lcfi1418:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1699:
+; NoVLX-NEXT:  .Lcfi1419:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -46971,11 +45847,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46987,11 +45863,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -47063,9 +45943,9 @@
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -47097,7 +45977,6 @@
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -47105,9 +45984,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -47141,7 +46021,6 @@
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -47149,9 +46028,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -47221,7 +46101,6 @@
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -47229,9 +46108,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -47407,9 +46287,9 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -47708,9 +46588,9 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -47866,12 +46746,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1700:
+; NoVLX-NEXT:  .Lcfi1420:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1701:
+; NoVLX-NEXT:  .Lcfi1421:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1702:
+; NoVLX-NEXT:  .Lcfi1422:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -47912,19 +46792,19 @@
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1703:
+; NoVLX-NEXT:  .Lcfi1423:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1704:
+; NoVLX-NEXT:  .Lcfi1424:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1705:
+; NoVLX-NEXT:  .Lcfi1425:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -47960,19 +46840,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1706:
+; NoVLX-NEXT:  .Lcfi1426:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1707:
+; NoVLX-NEXT:  .Lcfi1427:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1708:
+; NoVLX-NEXT:  .Lcfi1428:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -47980,9 +46859,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -48020,19 +46900,18 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1709:
+; NoVLX-NEXT:  .Lcfi1429:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1710:
+; NoVLX-NEXT:  .Lcfi1430:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1711:
+; NoVLX-NEXT:  .Lcfi1431:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -48040,9 +46919,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -48081,12 +46961,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1712:
+; NoVLX-NEXT:  .Lcfi1432:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1713:
+; NoVLX-NEXT:  .Lcfi1433:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1714:
+; NoVLX-NEXT:  .Lcfi1434:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -48131,12 +47011,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1715:
+; NoVLX-NEXT:  .Lcfi1435:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1716:
+; NoVLX-NEXT:  .Lcfi1436:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1717:
+; NoVLX-NEXT:  .Lcfi1437:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -48144,7 +47024,6 @@
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -48152,9 +47031,10 @@
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -48194,12 +47074,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1718:
+; NoVLX-NEXT:  .Lcfi1438:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1719:
+; NoVLX-NEXT:  .Lcfi1439:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1720:
+; NoVLX-NEXT:  .Lcfi1440:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -48209,8 +47089,8 @@
 ; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -48246,12 +47126,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1721:
+; NoVLX-NEXT:  .Lcfi1441:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1722:
+; NoVLX-NEXT:  .Lcfi1442:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1723:
+; NoVLX-NEXT:  .Lcfi1443:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -48261,8 +47141,8 @@
 ; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -48300,12 +47180,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1724:
+; NoVLX-NEXT:  .Lcfi1444:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1725:
+; NoVLX-NEXT:  .Lcfi1445:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1726:
+; NoVLX-NEXT:  .Lcfi1446:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -48325,8 +47205,8 @@
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -48366,12 +47246,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1727:
+; NoVLX-NEXT:  .Lcfi1447:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1728:
+; NoVLX-NEXT:  .Lcfi1448:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1729:
+; NoVLX-NEXT:  .Lcfi1449:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -48391,8 +47271,8 @@
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -48433,12 +47313,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1730:
+; NoVLX-NEXT:  .Lcfi1450:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1731:
+; NoVLX-NEXT:  .Lcfi1451:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1732:
+; NoVLX-NEXT:  .Lcfi1452:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -48449,8 +47329,8 @@
 ; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -48489,12 +47369,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1733:
+; NoVLX-NEXT:  .Lcfi1453:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1734:
+; NoVLX-NEXT:  .Lcfi1454:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1735:
+; NoVLX-NEXT:  .Lcfi1455:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -48515,8 +47395,8 @@
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -48580,8 +47460,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48640,8 +47520,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48685,7 +47565,6 @@
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -48703,6 +47582,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -48720,8 +47600,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48764,10 +47644,9 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm2
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -48785,6 +47664,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -48802,8 +47682,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48868,8 +47748,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48915,7 +47795,6 @@
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -48933,6 +47812,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -48950,8 +47830,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -49015,8 +47895,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49074,8 +47954,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49119,7 +47999,6 @@
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -49137,6 +48016,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -49153,8 +48033,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49197,10 +48077,9 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm2
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -49218,6 +48097,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -49234,8 +48114,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49299,8 +48179,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49346,7 +48226,6 @@
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -49364,6 +48243,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -49380,8 +48260,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49425,12 +48305,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1736:
+; NoVLX-NEXT:  .Lcfi1456:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1737:
+; NoVLX-NEXT:  .Lcfi1457:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1738:
+; NoVLX-NEXT:  .Lcfi1458:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -49473,12 +48353,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1739:
+; NoVLX-NEXT:  .Lcfi1459:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1740:
+; NoVLX-NEXT:  .Lcfi1460:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1741:
+; NoVLX-NEXT:  .Lcfi1461:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -49523,12 +48403,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1742:
+; NoVLX-NEXT:  .Lcfi1462:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1743:
+; NoVLX-NEXT:  .Lcfi1463:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1744:
+; NoVLX-NEXT:  .Lcfi1464:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -49536,7 +48416,6 @@
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -49554,6 +48433,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -49593,20 +48473,19 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1745:
+; NoVLX-NEXT:  .Lcfi1465:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1746:
+; NoVLX-NEXT:  .Lcfi1466:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1747:
+; NoVLX-NEXT:  .Lcfi1467:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm2
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -49624,6 +48503,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -49664,12 +48544,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1748:
+; NoVLX-NEXT:  .Lcfi1468:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1749:
+; NoVLX-NEXT:  .Lcfi1469:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1750:
+; NoVLX-NEXT:  .Lcfi1470:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -49716,12 +48596,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1751:
+; NoVLX-NEXT:  .Lcfi1471:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1752:
+; NoVLX-NEXT:  .Lcfi1472:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1753:
+; NoVLX-NEXT:  .Lcfi1473:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -49730,7 +48610,6 @@
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -49748,6 +48627,7 @@
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -49789,12 +48669,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1754:
+; NoVLX-NEXT:  .Lcfi1474:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1755:
+; NoVLX-NEXT:  .Lcfi1475:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1756:
+; NoVLX-NEXT:  .Lcfi1476:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -49805,8 +48685,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -49843,12 +48723,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1757:
+; NoVLX-NEXT:  .Lcfi1477:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1758:
+; NoVLX-NEXT:  .Lcfi1478:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1759:
+; NoVLX-NEXT:  .Lcfi1479:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -49859,8 +48739,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -49899,12 +48779,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1760:
+; NoVLX-NEXT:  .Lcfi1480:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1761:
+; NoVLX-NEXT:  .Lcfi1481:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1762:
+; NoVLX-NEXT:  .Lcfi1482:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -49915,7 +48795,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -49928,13 +48807,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -49975,12 +48855,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1763:
+; NoVLX-NEXT:  .Lcfi1483:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1764:
+; NoVLX-NEXT:  .Lcfi1484:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1765:
+; NoVLX-NEXT:  .Lcfi1485:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -49991,7 +48871,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -50004,13 +48883,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -50052,12 +48932,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1766:
+; NoVLX-NEXT:  .Lcfi1486:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1767:
+; NoVLX-NEXT:  .Lcfi1487:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1768:
+; NoVLX-NEXT:  .Lcfi1488:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -50069,8 +48949,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -50110,12 +48990,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1769:
+; NoVLX-NEXT:  .Lcfi1489:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1770:
+; NoVLX-NEXT:  .Lcfi1490:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1771:
+; NoVLX-NEXT:  .Lcfi1491:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -50127,7 +49007,6 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -50140,13 +49019,14 @@
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -50353,12 +49233,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1772:
+; NoVLX-NEXT:  .Lcfi1492:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1773:
+; NoVLX-NEXT:  .Lcfi1493:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1774:
+; NoVLX-NEXT:  .Lcfi1494:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -50426,12 +49306,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1775:
+; NoVLX-NEXT:  .Lcfi1495:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1776:
+; NoVLX-NEXT:  .Lcfi1496:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1777:
+; NoVLX-NEXT:  .Lcfi1497:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -50501,12 +49381,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1778:
+; NoVLX-NEXT:  .Lcfi1498:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1779:
+; NoVLX-NEXT:  .Lcfi1499:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1780:
+; NoVLX-NEXT:  .Lcfi1500:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -50578,12 +49458,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1781:
+; NoVLX-NEXT:  .Lcfi1501:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1782:
+; NoVLX-NEXT:  .Lcfi1502:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1783:
+; NoVLX-NEXT:  .Lcfi1503:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -50656,12 +49536,12 @@
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1784:
+; NoVLX-NEXT:  .Lcfi1504:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1785:
+; NoVLX-NEXT:  .Lcfi1505:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1786:
+; NoVLX-NEXT:  .Lcfi1506:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -50732,12 +49612,12 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1787:
+; NoVLX-NEXT:  .Lcfi1507:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1788:
+; NoVLX-NEXT:  .Lcfi1508:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1789:
+; NoVLX-NEXT:  .Lcfi1509:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -50811,53 +49691,53 @@
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1790:
+; NoVLX-NEXT:  .Lcfi1510:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1791:
+; NoVLX-NEXT:  .Lcfi1511:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1792:
+; NoVLX-NEXT:  .Lcfi1512:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -50889,53 +49769,53 @@
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1793:
+; NoVLX-NEXT:  .Lcfi1513:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1794:
+; NoVLX-NEXT:  .Lcfi1514:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1795:
+; NoVLX-NEXT:  .Lcfi1515:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpltuq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -50969,54 +49849,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1796:
+; NoVLX-NEXT:  .Lcfi1516:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1797:
+; NoVLX-NEXT:  .Lcfi1517:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1798:
+; NoVLX-NEXT:  .Lcfi1518:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -51051,54 +49931,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1799:
+; NoVLX-NEXT:  .Lcfi1519:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1800:
+; NoVLX-NEXT:  .Lcfi1520:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1801:
+; NoVLX-NEXT:  .Lcfi1521:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -51134,53 +50014,53 @@
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1802:
+; NoVLX-NEXT:  .Lcfi1522:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1803:
+; NoVLX-NEXT:  .Lcfi1523:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1804:
+; NoVLX-NEXT:  .Lcfi1524:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -51215,54 +50095,54 @@
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1805:
+; NoVLX-NEXT:  .Lcfi1525:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1806:
+; NoVLX-NEXT:  .Lcfi1526:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1807:
+; NoVLX-NEXT:  .Lcfi1527:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -51316,8 +50196,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51371,8 +50251,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51428,8 +50308,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51493,8 +50373,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51539,8 +50419,8 @@
 ; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
 ; NoVLX-NEXT:    vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
 ; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -51558,8 +50438,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51625,8 +50505,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51685,8 +50565,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51739,8 +50619,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51795,8 +50675,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51859,8 +50739,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51905,8 +50785,8 @@
 ; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
 ; NoVLX-NEXT:    vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
 ; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -51923,8 +50803,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51989,8 +50869,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -52033,12 +50913,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1808:
+; NoVLX-NEXT:  .Lcfi1528:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1809:
+; NoVLX-NEXT:  .Lcfi1529:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1810:
+; NoVLX-NEXT:  .Lcfi1530:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -52076,12 +50956,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1811:
+; NoVLX-NEXT:  .Lcfi1531:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1812:
+; NoVLX-NEXT:  .Lcfi1532:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1813:
+; NoVLX-NEXT:  .Lcfi1533:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -52120,12 +51000,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1814:
+; NoVLX-NEXT:  .Lcfi1534:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1815:
+; NoVLX-NEXT:  .Lcfi1535:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1816:
+; NoVLX-NEXT:  .Lcfi1536:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -52168,12 +51048,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1817:
+; NoVLX-NEXT:  .Lcfi1537:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1818:
+; NoVLX-NEXT:  .Lcfi1538:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1819:
+; NoVLX-NEXT:  .Lcfi1539:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -52221,12 +51101,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1820:
+; NoVLX-NEXT:  .Lcfi1540:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1821:
+; NoVLX-NEXT:  .Lcfi1541:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1822:
+; NoVLX-NEXT:  .Lcfi1542:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -52234,8 +51114,8 @@
 ; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
 ; NoVLX-NEXT:    vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
 ; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -52275,12 +51155,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1823:
+; NoVLX-NEXT:  .Lcfi1543:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1824:
+; NoVLX-NEXT:  .Lcfi1544:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1825:
+; NoVLX-NEXT:  .Lcfi1545:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -52331,20 +51211,20 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1826:
+; NoVLX-NEXT:  .Lcfi1546:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1827:
+; NoVLX-NEXT:  .Lcfi1547:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1828:
+; NoVLX-NEXT:  .Lcfi1548:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -52380,20 +51260,20 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1829:
+; NoVLX-NEXT:  .Lcfi1549:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1830:
+; NoVLX-NEXT:  .Lcfi1550:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1831:
+; NoVLX-NEXT:  .Lcfi1551:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -52430,12 +51310,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1832:
+; NoVLX-NEXT:  .Lcfi1552:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1833:
+; NoVLX-NEXT:  .Lcfi1553:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1834:
+; NoVLX-NEXT:  .Lcfi1554:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -52443,8 +51323,8 @@
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -52484,12 +51364,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1835:
+; NoVLX-NEXT:  .Lcfi1555:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1836:
+; NoVLX-NEXT:  .Lcfi1556:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1837:
+; NoVLX-NEXT:  .Lcfi1557:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -52502,8 +51382,8 @@
 ; NoVLX-NEXT:    vandps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -52543,12 +51423,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1838:
+; NoVLX-NEXT:  .Lcfi1558:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1839:
+; NoVLX-NEXT:  .Lcfi1559:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1840:
+; NoVLX-NEXT:  .Lcfi1560:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -52561,8 +51441,8 @@
 ; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -52603,12 +51483,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1841:
+; NoVLX-NEXT:  .Lcfi1561:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1842:
+; NoVLX-NEXT:  .Lcfi1562:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1843:
+; NoVLX-NEXT:  .Lcfi1563:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -52622,8 +51502,8 @@
 ; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -52854,12 +51734,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1844:
+; NoVLX-NEXT:  .Lcfi1564:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1845:
+; NoVLX-NEXT:  .Lcfi1565:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1846:
+; NoVLX-NEXT:  .Lcfi1566:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -52929,12 +51809,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1847:
+; NoVLX-NEXT:  .Lcfi1567:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1848:
+; NoVLX-NEXT:  .Lcfi1568:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1849:
+; NoVLX-NEXT:  .Lcfi1569:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -53005,12 +51885,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1850:
+; NoVLX-NEXT:  .Lcfi1570:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1851:
+; NoVLX-NEXT:  .Lcfi1571:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1852:
+; NoVLX-NEXT:  .Lcfi1572:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -53083,12 +51963,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1853:
+; NoVLX-NEXT:  .Lcfi1573:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1854:
+; NoVLX-NEXT:  .Lcfi1574:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1855:
+; NoVLX-NEXT:  .Lcfi1575:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -53163,12 +52043,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1856:
+; NoVLX-NEXT:  .Lcfi1576:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1857:
+; NoVLX-NEXT:  .Lcfi1577:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1858:
+; NoVLX-NEXT:  .Lcfi1578:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -53244,12 +52124,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1859:
+; NoVLX-NEXT:  .Lcfi1579:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1860:
+; NoVLX-NEXT:  .Lcfi1580:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1861:
+; NoVLX-NEXT:  .Lcfi1581:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -53327,55 +52207,55 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1862:
+; NoVLX-NEXT:  .Lcfi1582:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1863:
+; NoVLX-NEXT:  .Lcfi1583:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1864:
+; NoVLX-NEXT:  .Lcfi1584:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -53407,55 +52287,55 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1865:
+; NoVLX-NEXT:  .Lcfi1585:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1866:
+; NoVLX-NEXT:  .Lcfi1586:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1867:
+; NoVLX-NEXT:  .Lcfi1587:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovaps (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -53488,55 +52368,55 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1868:
+; NoVLX-NEXT:  .Lcfi1588:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1869:
+; NoVLX-NEXT:  .Lcfi1589:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1870:
+; NoVLX-NEXT:  .Lcfi1590:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vbroadcastss (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -53571,12 +52451,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1871:
+; NoVLX-NEXT:  .Lcfi1591:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1872:
+; NoVLX-NEXT:  .Lcfi1592:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1873:
+; NoVLX-NEXT:  .Lcfi1593:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -53585,43 +52465,43 @@
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -53656,12 +52536,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1874:
+; NoVLX-NEXT:  .Lcfi1594:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1875:
+; NoVLX-NEXT:  .Lcfi1595:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1876:
+; NoVLX-NEXT:  .Lcfi1596:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -53670,43 +52550,43 @@
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -53742,12 +52622,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1877:
+; NoVLX-NEXT:  .Lcfi1597:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1878:
+; NoVLX-NEXT:  .Lcfi1598:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1879:
+; NoVLX-NEXT:  .Lcfi1599:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -53756,43 +52636,43 @@
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -53830,93 +52710,78 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1880:
+; NoVLX-NEXT:  .Lcfi1600:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1881:
+; NoVLX-NEXT:  .Lcfi1601:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1882:
+; NoVLX-NEXT:  .Lcfi1602:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1883:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1884:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1885:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1886:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1887:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -53925,12 +52790,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -53953,93 +52813,78 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1888:
+; NoVLX-NEXT:  .Lcfi1603:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1889:
+; NoVLX-NEXT:  .Lcfi1604:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1890:
+; NoVLX-NEXT:  .Lcfi1605:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1891:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1892:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1893:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1894:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1895:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54048,12 +52893,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -54077,93 +52917,78 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1896:
+; NoVLX-NEXT:  .Lcfi1606:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1897:
+; NoVLX-NEXT:  .Lcfi1607:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1898:
+; NoVLX-NEXT:  .Lcfi1608:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1899:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1900:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1901:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1902:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1903:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54172,12 +52997,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -54203,94 +53023,79 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1904:
+; NoVLX-NEXT:  .Lcfi1609:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1905:
+; NoVLX-NEXT:  .Lcfi1610:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1906:
+; NoVLX-NEXT:  .Lcfi1611:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1907:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1908:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1909:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1910:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1911:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54299,12 +53104,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -54330,94 +53130,79 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1912:
+; NoVLX-NEXT:  .Lcfi1612:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1913:
+; NoVLX-NEXT:  .Lcfi1613:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1914:
+; NoVLX-NEXT:  .Lcfi1614:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1915:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1916:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1917:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1918:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1919:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54426,12 +53211,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -54458,94 +53238,79 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1920:
+; NoVLX-NEXT:  .Lcfi1615:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1921:
+; NoVLX-NEXT:  .Lcfi1616:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1922:
+; NoVLX-NEXT:  .Lcfi1617:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1923:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1924:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1925:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1926:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1927:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54554,12 +53319,7 @@
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -54634,12 +53394,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1928:
+; NoVLX-NEXT:  .Lcfi1618:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1929:
+; NoVLX-NEXT:  .Lcfi1619:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1930:
+; NoVLX-NEXT:  .Lcfi1620:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -54648,21 +53408,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1931:
+; NoVLX-NEXT:  .Lcfi1621:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1932:
+; NoVLX-NEXT:  .Lcfi1622:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1933:
+; NoVLX-NEXT:  .Lcfi1623:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1934:
+; NoVLX-NEXT:  .Lcfi1624:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1935:
+; NoVLX-NEXT:  .Lcfi1625:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -54705,11 +53461,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -54721,11 +53477,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -54762,12 +53522,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1936:
+; NoVLX-NEXT:  .Lcfi1626:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1937:
+; NoVLX-NEXT:  .Lcfi1627:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1938:
+; NoVLX-NEXT:  .Lcfi1628:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -54776,21 +53536,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1939:
+; NoVLX-NEXT:  .Lcfi1629:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1940:
+; NoVLX-NEXT:  .Lcfi1630:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1941:
+; NoVLX-NEXT:  .Lcfi1631:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1942:
+; NoVLX-NEXT:  .Lcfi1632:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1943:
+; NoVLX-NEXT:  .Lcfi1633:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -54833,11 +53589,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -54849,11 +53605,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -54891,12 +53651,12 @@
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1944:
+; NoVLX-NEXT:  .Lcfi1634:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1945:
+; NoVLX-NEXT:  .Lcfi1635:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1946:
+; NoVLX-NEXT:  .Lcfi1636:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -54905,21 +53665,17 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1947:
+; NoVLX-NEXT:  .Lcfi1637:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1948:
+; NoVLX-NEXT:  .Lcfi1638:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1949:
+; NoVLX-NEXT:  .Lcfi1639:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1950:
+; NoVLX-NEXT:  .Lcfi1640:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1951:
+; NoVLX-NEXT:  .Lcfi1641:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -54962,11 +53718,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -54978,11 +53734,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -55022,12 +53782,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1952:
+; NoVLX-NEXT:  .Lcfi1642:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1953:
+; NoVLX-NEXT:  .Lcfi1643:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1954:
+; NoVLX-NEXT:  .Lcfi1644:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -55036,22 +53796,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1955:
+; NoVLX-NEXT:  .Lcfi1645:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1956:
+; NoVLX-NEXT:  .Lcfi1646:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1957:
+; NoVLX-NEXT:  .Lcfi1647:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1958:
+; NoVLX-NEXT:  .Lcfi1648:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1959:
+; NoVLX-NEXT:  .Lcfi1649:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -55094,11 +53850,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -55110,11 +53866,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -55154,12 +53914,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1960:
+; NoVLX-NEXT:  .Lcfi1650:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1961:
+; NoVLX-NEXT:  .Lcfi1651:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1962:
+; NoVLX-NEXT:  .Lcfi1652:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -55168,22 +53928,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1963:
+; NoVLX-NEXT:  .Lcfi1653:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1964:
+; NoVLX-NEXT:  .Lcfi1654:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1965:
+; NoVLX-NEXT:  .Lcfi1655:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1966:
+; NoVLX-NEXT:  .Lcfi1656:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1967:
+; NoVLX-NEXT:  .Lcfi1657:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -55226,11 +53982,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -55242,11 +53998,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -55287,12 +54047,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1968:
+; NoVLX-NEXT:  .Lcfi1658:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1969:
+; NoVLX-NEXT:  .Lcfi1659:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1970:
+; NoVLX-NEXT:  .Lcfi1660:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -55301,22 +54061,18 @@
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1971:
+; NoVLX-NEXT:  .Lcfi1661:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1972:
+; NoVLX-NEXT:  .Lcfi1662:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1973:
+; NoVLX-NEXT:  .Lcfi1663:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1974:
+; NoVLX-NEXT:  .Lcfi1664:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1975:
+; NoVLX-NEXT:  .Lcfi1665:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -55359,11 +54115,11 @@
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -55375,11 +54131,15 @@
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -55598,8 +54358,8 @@
 ; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
@@ -56207,12 +54967,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1976:
+; NoVLX-NEXT:  .Lcfi1666:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1977:
+; NoVLX-NEXT:  .Lcfi1667:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1978:
+; NoVLX-NEXT:  .Lcfi1668:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -56250,12 +55010,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1979:
+; NoVLX-NEXT:  .Lcfi1669:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1980:
+; NoVLX-NEXT:  .Lcfi1670:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1981:
+; NoVLX-NEXT:  .Lcfi1671:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -56294,12 +55054,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1982:
+; NoVLX-NEXT:  .Lcfi1672:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1983:
+; NoVLX-NEXT:  .Lcfi1673:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1984:
+; NoVLX-NEXT:  .Lcfi1674:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -56342,12 +55102,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1985:
+; NoVLX-NEXT:  .Lcfi1675:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1986:
+; NoVLX-NEXT:  .Lcfi1676:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1987:
+; NoVLX-NEXT:  .Lcfi1677:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -56394,20 +55154,20 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1988:
+; NoVLX-NEXT:  .Lcfi1678:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1989:
+; NoVLX-NEXT:  .Lcfi1679:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1990:
+; NoVLX-NEXT:  .Lcfi1680:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -56447,12 +55207,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1991:
+; NoVLX-NEXT:  .Lcfi1681:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1992:
+; NoVLX-NEXT:  .Lcfi1682:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1993:
+; NoVLX-NEXT:  .Lcfi1683:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -56502,20 +55262,20 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1994:
+; NoVLX-NEXT:  .Lcfi1684:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1995:
+; NoVLX-NEXT:  .Lcfi1685:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1996:
+; NoVLX-NEXT:  .Lcfi1686:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -56551,20 +55311,20 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1997:
+; NoVLX-NEXT:  .Lcfi1687:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1998:
+; NoVLX-NEXT:  .Lcfi1688:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1999:
+; NoVLX-NEXT:  .Lcfi1689:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -56601,12 +55361,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2000:
+; NoVLX-NEXT:  .Lcfi1690:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2001:
+; NoVLX-NEXT:  .Lcfi1691:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2002:
+; NoVLX-NEXT:  .Lcfi1692:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -56614,8 +55374,8 @@
 ; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -56655,12 +55415,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2003:
+; NoVLX-NEXT:  .Lcfi1693:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2004:
+; NoVLX-NEXT:  .Lcfi1694:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2005:
+; NoVLX-NEXT:  .Lcfi1695:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -56672,8 +55432,8 @@
 ; NoVLX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -56713,12 +55473,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2006:
+; NoVLX-NEXT:  .Lcfi1696:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2007:
+; NoVLX-NEXT:  .Lcfi1697:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2008:
+; NoVLX-NEXT:  .Lcfi1698:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -56730,8 +55490,8 @@
 ; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -56772,12 +55532,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2009:
+; NoVLX-NEXT:  .Lcfi1699:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2010:
+; NoVLX-NEXT:  .Lcfi1700:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2011:
+; NoVLX-NEXT:  .Lcfi1701:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -56790,8 +55550,8 @@
 ; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -56852,8 +55612,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -56909,8 +55669,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -56968,8 +55728,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -57035,8 +55795,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -57102,8 +55862,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -57171,8 +55931,8 @@
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -57233,8 +55993,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57289,8 +56049,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57347,8 +56107,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57413,8 +56173,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57479,8 +56239,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57547,8 +56307,8 @@
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57592,12 +56352,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2012:
+; NoVLX-NEXT:  .Lcfi1702:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2013:
+; NoVLX-NEXT:  .Lcfi1703:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2014:
+; NoVLX-NEXT:  .Lcfi1704:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -57637,12 +56397,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2015:
+; NoVLX-NEXT:  .Lcfi1705:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2016:
+; NoVLX-NEXT:  .Lcfi1706:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2017:
+; NoVLX-NEXT:  .Lcfi1707:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -57683,12 +56443,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2018:
+; NoVLX-NEXT:  .Lcfi1708:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2019:
+; NoVLX-NEXT:  .Lcfi1709:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2020:
+; NoVLX-NEXT:  .Lcfi1710:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -57733,12 +56493,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2021:
+; NoVLX-NEXT:  .Lcfi1711:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2022:
+; NoVLX-NEXT:  .Lcfi1712:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2023:
+; NoVLX-NEXT:  .Lcfi1713:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -57788,12 +56548,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2024:
+; NoVLX-NEXT:  .Lcfi1714:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2025:
+; NoVLX-NEXT:  .Lcfi1715:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2026:
+; NoVLX-NEXT:  .Lcfi1716:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -57844,12 +56604,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2027:
+; NoVLX-NEXT:  .Lcfi1717:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2028:
+; NoVLX-NEXT:  .Lcfi1718:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2029:
+; NoVLX-NEXT:  .Lcfi1719:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -57902,12 +56662,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2030:
+; NoVLX-NEXT:  .Lcfi1720:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2031:
+; NoVLX-NEXT:  .Lcfi1721:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2032:
+; NoVLX-NEXT:  .Lcfi1722:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -57915,8 +56675,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -57953,12 +56713,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2033:
+; NoVLX-NEXT:  .Lcfi1723:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2034:
+; NoVLX-NEXT:  .Lcfi1724:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2035:
+; NoVLX-NEXT:  .Lcfi1725:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -57966,8 +56726,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -58005,12 +56765,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2036:
+; NoVLX-NEXT:  .Lcfi1726:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2037:
+; NoVLX-NEXT:  .Lcfi1727:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2038:
+; NoVLX-NEXT:  .Lcfi1728:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -58019,8 +56779,8 @@
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -58061,12 +56821,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2039:
+; NoVLX-NEXT:  .Lcfi1729:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2040:
+; NoVLX-NEXT:  .Lcfi1730:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2041:
+; NoVLX-NEXT:  .Lcfi1731:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -58080,8 +56840,8 @@
 ; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -58122,12 +56882,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2042:
+; NoVLX-NEXT:  .Lcfi1732:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2043:
+; NoVLX-NEXT:  .Lcfi1733:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2044:
+; NoVLX-NEXT:  .Lcfi1734:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -58141,8 +56901,8 @@
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -58184,12 +56944,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2045:
+; NoVLX-NEXT:  .Lcfi1735:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2046:
+; NoVLX-NEXT:  .Lcfi1736:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2047:
+; NoVLX-NEXT:  .Lcfi1737:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -58204,8 +56964,8 @@
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -58466,12 +57226,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2048:
+; NoVLX-NEXT:  .Lcfi1738:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2049:
+; NoVLX-NEXT:  .Lcfi1739:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2050:
+; NoVLX-NEXT:  .Lcfi1740:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -58539,12 +57299,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2051:
+; NoVLX-NEXT:  .Lcfi1741:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2052:
+; NoVLX-NEXT:  .Lcfi1742:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2053:
+; NoVLX-NEXT:  .Lcfi1743:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -58613,12 +57373,12 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2054:
+; NoVLX-NEXT:  .Lcfi1744:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2055:
+; NoVLX-NEXT:  .Lcfi1745:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2056:
+; NoVLX-NEXT:  .Lcfi1746:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -58689,12 +57449,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2057:
+; NoVLX-NEXT:  .Lcfi1747:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2058:
+; NoVLX-NEXT:  .Lcfi1748:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2059:
+; NoVLX-NEXT:  .Lcfi1749:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -58766,12 +57526,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2060:
+; NoVLX-NEXT:  .Lcfi1750:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2061:
+; NoVLX-NEXT:  .Lcfi1751:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2062:
+; NoVLX-NEXT:  .Lcfi1752:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -58844,12 +57604,12 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2063:
+; NoVLX-NEXT:  .Lcfi1753:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2064:
+; NoVLX-NEXT:  .Lcfi1754:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2065:
+; NoVLX-NEXT:  .Lcfi1755:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -58972,53 +57732,53 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2066:
+; NoVLX-NEXT:  .Lcfi1756:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2067:
+; NoVLX-NEXT:  .Lcfi1757:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2068:
+; NoVLX-NEXT:  .Lcfi1758:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -59050,53 +57810,53 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2069:
+; NoVLX-NEXT:  .Lcfi1759:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2070:
+; NoVLX-NEXT:  .Lcfi1760:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2071:
+; NoVLX-NEXT:  .Lcfi1761:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -59129,53 +57889,53 @@
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2072:
+; NoVLX-NEXT:  .Lcfi1762:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2073:
+; NoVLX-NEXT:  .Lcfi1763:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2074:
+; NoVLX-NEXT:  .Lcfi1764:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -59210,54 +57970,54 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2075:
+; NoVLX-NEXT:  .Lcfi1765:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2076:
+; NoVLX-NEXT:  .Lcfi1766:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2077:
+; NoVLX-NEXT:  .Lcfi1767:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -59292,54 +58052,54 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2078:
+; NoVLX-NEXT:  .Lcfi1768:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2079:
+; NoVLX-NEXT:  .Lcfi1769:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2080:
+; NoVLX-NEXT:  .Lcfi1770:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -59375,54 +58135,54 @@
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi2081:
+; NoVLX-NEXT:  .Lcfi1771:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi2082:
+; NoVLX-NEXT:  .Lcfi1772:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi2083:
+; NoVLX-NEXT:  .Lcfi1773:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
diff --git a/llvm/test/CodeGen/X86/bmi-schedule.ll b/llvm/test/CodeGen/X86/bmi-schedule.ll
index 1041271..79f906f 100644
--- a/llvm/test/CodeGen/X86/bmi-schedule.ll
+++ b/llvm/test/CodeGen/X86/bmi-schedule.ll
@@ -20,10 +20,10 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    andnl %esi, %edi, %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    notl %edi # sched: [1:0.25]
-; HASWELL-NEXT:    andw (%rdx), %di # sched: [5:0.50]
+; HASWELL-NEXT:    andw (%rdx), %di # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %edi, %eax # sched: [1:0.25]
 ; HASWELL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andn_i16:
 ; BTVER2:       # BB#0:
@@ -61,9 +61,9 @@
 ; HASWELL-LABEL: test_andn_i32:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    andnl (%rdx), %edi, %eax # sched: [4:0.50]
+; HASWELL-NEXT:    andnl (%rdx), %edi, %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andn_i32:
 ; BTVER2:       # BB#0:
@@ -97,9 +97,9 @@
 ; HASWELL-LABEL: test_andn_i64:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    andnq (%rdx), %rdi, %rax # sched: [4:0.50]
+; HASWELL-NEXT:    andnq (%rdx), %rdi, %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andn_i64:
 ; BTVER2:       # BB#0:
@@ -132,10 +132,10 @@
 ;
 ; HASWELL-LABEL: test_bextr_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [6:0.50]
+; HASWELL-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [2:0.50]
 ; HASWELL-NEXT:    bextrl %edi, %esi, %eax # sched: [2:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_bextr_i32:
 ; BTVER2:       # BB#0:
@@ -168,10 +168,10 @@
 ;
 ; HASWELL-LABEL: test_bextr_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [6:0.50]
+; HASWELL-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [2:0.50]
 ; HASWELL-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_bextr_i64:
 ; BTVER2:       # BB#0:
@@ -204,10 +204,10 @@
 ;
 ; HASWELL-LABEL: test_blsi_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    blsil (%rsi), %ecx # sched: [4:0.50]
+; HASWELL-NEXT:    blsil (%rsi), %ecx # sched: [1:0.50]
 ; HASWELL-NEXT:    blsil %edi, %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blsi_i32:
 ; BTVER2:       # BB#0:
@@ -241,10 +241,10 @@
 ;
 ; HASWELL-LABEL: test_blsi_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    blsiq (%rsi), %rcx # sched: [4:0.50]
+; HASWELL-NEXT:    blsiq (%rsi), %rcx # sched: [1:0.50]
 ; HASWELL-NEXT:    blsiq %rdi, %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blsi_i64:
 ; BTVER2:       # BB#0:
@@ -278,10 +278,10 @@
 ;
 ; HASWELL-LABEL: test_blsmsk_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    blsmskl (%rsi), %ecx # sched: [4:0.50]
+; HASWELL-NEXT:    blsmskl (%rsi), %ecx # sched: [1:0.50]
 ; HASWELL-NEXT:    blsmskl %edi, %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blsmsk_i32:
 ; BTVER2:       # BB#0:
@@ -315,10 +315,10 @@
 ;
 ; HASWELL-LABEL: test_blsmsk_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    blsmskq (%rsi), %rcx # sched: [4:0.50]
+; HASWELL-NEXT:    blsmskq (%rsi), %rcx # sched: [1:0.50]
 ; HASWELL-NEXT:    blsmskq %rdi, %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blsmsk_i64:
 ; BTVER2:       # BB#0:
@@ -352,10 +352,10 @@
 ;
 ; HASWELL-LABEL: test_blsr_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    blsrl (%rsi), %ecx # sched: [4:0.50]
+; HASWELL-NEXT:    blsrl (%rsi), %ecx # sched: [1:0.50]
 ; HASWELL-NEXT:    blsrl %edi, %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blsr_i32:
 ; BTVER2:       # BB#0:
@@ -389,10 +389,10 @@
 ;
 ; HASWELL-LABEL: test_blsr_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    blsrq (%rsi), %rcx # sched: [4:0.50]
+; HASWELL-NEXT:    blsrq (%rsi), %rcx # sched: [1:0.50]
 ; HASWELL-NEXT:    blsrq %rdi, %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blsr_i64:
 ; BTVER2:       # BB#0:
@@ -427,11 +427,11 @@
 ;
 ; HASWELL-LABEL: test_cttz_i16:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    tzcntw (%rsi), %cx # sched: [7:1.00]
+; HASWELL-NEXT:    tzcntw (%rsi), %cx # sched: [3:1.00]
 ; HASWELL-NEXT:    tzcntw %di, %ax # sched: [3:1.00]
 ; HASWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; HASWELL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cttz_i16:
 ; BTVER2:       # BB#0:
@@ -466,10 +466,10 @@
 ;
 ; HASWELL-LABEL: test_cttz_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    tzcntl (%rsi), %ecx # sched: [7:1.00]
+; HASWELL-NEXT:    tzcntl (%rsi), %ecx # sched: [3:1.00]
 ; HASWELL-NEXT:    tzcntl %edi, %eax # sched: [3:1.00]
 ; HASWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cttz_i32:
 ; BTVER2:       # BB#0:
@@ -502,10 +502,10 @@
 ;
 ; HASWELL-LABEL: test_cttz_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    tzcntq (%rsi), %rcx # sched: [7:1.00]
+; HASWELL-NEXT:    tzcntq (%rsi), %rcx # sched: [3:1.00]
 ; HASWELL-NEXT:    tzcntq %rdi, %rax # sched: [3:1.00]
 ; HASWELL-NEXT:    orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cttz_i64:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/bmi2-schedule.ll b/llvm/test/CodeGen/X86/bmi2-schedule.ll
index ec9820f..eda2960 100644
--- a/llvm/test/CodeGen/X86/bmi2-schedule.ll
+++ b/llvm/test/CodeGen/X86/bmi2-schedule.ll
@@ -15,10 +15,10 @@
 ;
 ; HASWELL-LABEL: test_bzhi_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    bzhil %edi, (%rdx), %ecx # sched: [4:0.50]
+; HASWELL-NEXT:    bzhil %edi, (%rdx), %ecx # sched: [1:0.50]
 ; HASWELL-NEXT:    bzhil %edi, %esi, %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_bzhi_i32:
 ; ZNVER1:       # BB#0:
@@ -44,10 +44,10 @@
 ;
 ; HASWELL-LABEL: test_bzhi_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    bzhiq %rdi, (%rdx), %rcx # sched: [4:0.50]
+; HASWELL-NEXT:    bzhiq %rdi, (%rdx), %rcx # sched: [1:0.50]
 ; HASWELL-NEXT:    bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_bzhi_i64:
 ; ZNVER1:       # BB#0:
@@ -80,9 +80,9 @@
 ; HASWELL-NEXT:    movq %rdx, %rax # sched: [1:0.25]
 ; HASWELL-NEXT:    movq %rdi, %rdx # sched: [1:0.25]
 ; HASWELL-NEXT:    mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    mulxq (%rax), %rdx, %rax # sched: [8:1.00]
+; HASWELL-NEXT:    mulxq (%rax), %rdx, %rax # sched: [4:1.00]
 ; HASWELL-NEXT:    orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_mulx_i64:
 ; ZNVER1:       # BB#0:
@@ -116,10 +116,10 @@
 ;
 ; HASWELL-LABEL: test_pdep_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    pdepl (%rdx), %edi, %ecx # sched: [7:1.00]
+; HASWELL-NEXT:    pdepl (%rdx), %edi, %ecx # sched: [3:1.00]
 ; HASWELL-NEXT:    pdepl %esi, %edi, %eax # sched: [3:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pdep_i32:
 ; ZNVER1:       # BB#0:
@@ -145,10 +145,10 @@
 ;
 ; HASWELL-LABEL: test_pdep_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    pdepq (%rdx), %rdi, %rcx # sched: [7:1.00]
+; HASWELL-NEXT:    pdepq (%rdx), %rdi, %rcx # sched: [3:1.00]
 ; HASWELL-NEXT:    pdepq %rsi, %rdi, %rax # sched: [3:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pdep_i64:
 ; ZNVER1:       # BB#0:
@@ -174,10 +174,10 @@
 ;
 ; HASWELL-LABEL: test_pext_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    pextl (%rdx), %edi, %ecx # sched: [7:1.00]
+; HASWELL-NEXT:    pextl (%rdx), %edi, %ecx # sched: [3:1.00]
 ; HASWELL-NEXT:    pextl %esi, %edi, %eax # sched: [3:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pext_i32:
 ; ZNVER1:       # BB#0:
@@ -203,10 +203,10 @@
 ;
 ; HASWELL-LABEL: test_pext_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    pextq (%rdx), %rdi, %rcx # sched: [7:1.00]
+; HASWELL-NEXT:    pextq (%rdx), %rdi, %rcx # sched: [3:1.00]
 ; HASWELL-NEXT:    pextq %rsi, %rdi, %rax # sched: [3:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pext_i64:
 ; ZNVER1:       # BB#0:
@@ -233,9 +233,9 @@
 ; HASWELL-LABEL: test_rorx_i32:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    rorxl $5, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    rorxl $5, (%rdx), %eax # sched: [5:0.50]
+; HASWELL-NEXT:    rorxl $5, (%rdx), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_rorx_i32:
 ; ZNVER1:       # BB#0:
@@ -265,9 +265,9 @@
 ; HASWELL-LABEL: test_rorx_i64:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    rorxq $5, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    rorxq $5, (%rdx), %rax # sched: [5:0.50]
+; HASWELL-NEXT:    rorxq $5, (%rdx), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_rorx_i64:
 ; ZNVER1:       # BB#0:
@@ -297,9 +297,9 @@
 ; HASWELL-LABEL: test_sarx_i32:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    sarxl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    sarxl %esi, (%rdx), %eax # sched: [5:0.50]
+; HASWELL-NEXT:    sarxl %esi, (%rdx), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_sarx_i32:
 ; ZNVER1:       # BB#0:
@@ -325,9 +325,9 @@
 ; HASWELL-LABEL: test_sarx_i64:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    sarxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; HASWELL-NEXT:    sarxq %rsi, (%rdx), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_sarx_i64:
 ; ZNVER1:       # BB#0:
@@ -353,9 +353,9 @@
 ; HASWELL-LABEL: test_shlx_i32:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    shlxl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    shlxl %esi, (%rdx), %eax # sched: [5:0.50]
+; HASWELL-NEXT:    shlxl %esi, (%rdx), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_shlx_i32:
 ; ZNVER1:       # BB#0:
@@ -381,9 +381,9 @@
 ; HASWELL-LABEL: test_shlx_i64:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    shlxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; HASWELL-NEXT:    shlxq %rsi, (%rdx), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_shlx_i64:
 ; ZNVER1:       # BB#0:
@@ -409,9 +409,9 @@
 ; HASWELL-LABEL: test_shrx_i32:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    shrxl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    shrxl %esi, (%rdx), %eax # sched: [5:0.50]
+; HASWELL-NEXT:    shrxl %esi, (%rdx), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_shrx_i32:
 ; ZNVER1:       # BB#0:
@@ -437,9 +437,9 @@
 ; HASWELL-LABEL: test_shrx_i64:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    shrxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; HASWELL-NEXT:    shrxq %rsi, (%rdx), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_shrx_i64:
 ; ZNVER1:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/f16c-schedule.ll b/llvm/test/CodeGen/X86/f16c-schedule.ll
index 1a17b8b..24b7da2 100644
--- a/llvm/test/CodeGen/X86/f16c-schedule.ll
+++ b/llvm/test/CodeGen/X86/f16c-schedule.ll
@@ -23,10 +23,10 @@
 ;
 ; HASWELL-LABEL: test_vcvtph2ps_128:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00]
-; HASWELL-NEXT:    vcvtph2ps %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT:    vcvtph2ps (%rdi), %xmm1 # sched: [1:1.00]
+; HASWELL-NEXT:    vcvtph2ps %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_vcvtph2ps_128:
 ; BTVER2:       # BB#0:
@@ -66,10 +66,10 @@
 ;
 ; HASWELL-LABEL: test_vcvtph2ps_256:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00]
-; HASWELL-NEXT:    vcvtph2ps %xmm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT:    vcvtph2ps (%rdi), %ymm1 # sched: [1:1.00]
+; HASWELL-NEXT:    vcvtph2ps %xmm0, %ymm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_vcvtph2ps_256:
 ; BTVER2:       # BB#0:
@@ -108,8 +108,8 @@
 ; HASWELL-LABEL: test_vcvtps2ph_128:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_vcvtps2ph_128:
 ; BTVER2:       # BB#0:
@@ -147,10 +147,10 @@
 ;
 ; HASWELL-LABEL: test_vcvtps2ph_256:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [1:?]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [6:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_vcvtps2ph_256:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/lea32-schedule.ll b/llvm/test/CodeGen/X86/lea32-schedule.ll
index eb4e300..32b892b 100644
--- a/llvm/test/CodeGen/X86/lea32-schedule.ll
+++ b/llvm/test/CodeGen/X86/lea32-schedule.ll
@@ -45,7 +45,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HASWELL-NEXT:    leal -24(%rdi), %eax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_offset:
 ; BTVER2:       # BB#0:
@@ -97,7 +97,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HASWELL-NEXT:    leal 1024(%rdi), %eax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_offset_big:
 ; BTVER2:       # BB#0:
@@ -155,7 +155,7 @@
 ; HASWELL-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
 ; HASWELL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HASWELL-NEXT:    leal (%rdi,%rsi), %eax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add:
 ; BTVER2:       # BB#0:
@@ -217,7 +217,7 @@
 ; HASWELL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HASWELL-NEXT:    leal (%rdi,%rsi), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl $16, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_offset:
 ; BTVER2:       # BB#0:
@@ -283,7 +283,7 @@
 ; HASWELL-NEXT:    leal (%rdi,%rsi), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl $-4096, %eax # imm = 0xF000
 ; HASWELL-NEXT:    # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_offset_big:
 ; BTVER2:       # BB#0:
@@ -338,7 +338,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HASWELL-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul:
 ; BTVER2:       # BB#0:
@@ -393,7 +393,7 @@
 ; HASWELL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HASWELL-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl $-32, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul_offset:
 ; BTVER2:       # BB#0:
@@ -452,7 +452,7 @@
 ; HASWELL-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl $10000, %eax # imm = 0x2710
 ; HASWELL-NEXT:    # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul_offset_big:
 ; BTVER2:       # BB#0:
@@ -510,7 +510,7 @@
 ; HASWELL-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
 ; HASWELL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HASWELL-NEXT:    leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale:
 ; BTVER2:       # BB#0:
@@ -573,7 +573,7 @@
 ; HASWELL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HASWELL-NEXT:    leal (%rdi,%rsi,4), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl $96, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale_offset:
 ; BTVER2:       # BB#0:
@@ -640,7 +640,7 @@
 ; HASWELL-NEXT:    leal (%rdi,%rsi,8), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    addl $-1200, %eax # imm = 0xFB50
 ; HASWELL-NEXT:    # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale_offset_big:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/lea64-schedule.ll b/llvm/test/CodeGen/X86/lea64-schedule.ll
index dd629c3..e7b2960 100644
--- a/llvm/test/CodeGen/X86/lea64-schedule.ll
+++ b/llvm/test/CodeGen/X86/lea64-schedule.ll
@@ -40,7 +40,7 @@
 ; HASWELL-LABEL: test_lea_offset:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    leaq -24(%rdi), %rax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_offset:
 ; BTVER2:       # BB#0:
@@ -85,7 +85,7 @@
 ; HASWELL-LABEL: test_lea_offset_big:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    leaq 1024(%rdi), %rax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_offset_big:
 ; BTVER2:       # BB#0:
@@ -131,7 +131,7 @@
 ; HASWELL-LABEL: test_lea_add:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add:
 ; BTVER2:       # BB#0:
@@ -179,7 +179,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq $16, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_offset:
 ; BTVER2:       # BB#0:
@@ -231,7 +231,7 @@
 ; HASWELL-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq $-4096, %rax # imm = 0xF000
 ; HASWELL-NEXT:    # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_offset_big:
 ; BTVER2:       # BB#0:
@@ -277,7 +277,7 @@
 ; HASWELL-LABEL: test_lea_mul:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul:
 ; BTVER2:       # BB#0:
@@ -325,7 +325,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq $-32, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul_offset:
 ; BTVER2:       # BB#0:
@@ -377,7 +377,7 @@
 ; HASWELL-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq $10000, %rax # imm = 0x2710
 ; HASWELL-NEXT:    # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul_offset_big:
 ; BTVER2:       # BB#0:
@@ -423,7 +423,7 @@
 ; HASWELL-LABEL: test_lea_add_scale:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale:
 ; BTVER2:       # BB#0:
@@ -472,7 +472,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq $96, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale_offset:
 ; BTVER2:       # BB#0:
@@ -525,7 +525,7 @@
 ; HASWELL-NEXT:    leaq (%rdi,%rsi,8), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    addq $-1200, %rax # imm = 0xFB50
 ; HASWELL-NEXT:    # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale_offset_big:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/lzcnt-schedule.ll b/llvm/test/CodeGen/X86/lzcnt-schedule.ll
index 64441c7..d321ffb 100644
--- a/llvm/test/CodeGen/X86/lzcnt-schedule.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-schedule.ll
@@ -17,11 +17,11 @@
 ;
 ; HASWELL-LABEL: test_ctlz_i16:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    lzcntw (%rsi), %cx
-; HASWELL-NEXT:    lzcntw %di, %ax
+; HASWELL-NEXT:    lzcntw (%rsi), %cx # sched: [3:1.00]
+; HASWELL-NEXT:    lzcntw %di, %ax # sched: [3:1.00]
 ; HASWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; HASWELL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ctlz_i16:
 ; BTVER2:       # BB#0:
@@ -56,10 +56,10 @@
 ;
 ; HASWELL-LABEL: test_ctlz_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    lzcntl (%rsi), %ecx
-; HASWELL-NEXT:    lzcntl %edi, %eax
+; HASWELL-NEXT:    lzcntl (%rsi), %ecx # sched: [3:1.00]
+; HASWELL-NEXT:    lzcntl %edi, %eax # sched: [3:1.00]
 ; HASWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ctlz_i32:
 ; BTVER2:       # BB#0:
@@ -92,10 +92,10 @@
 ;
 ; HASWELL-LABEL: test_ctlz_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    lzcntq (%rsi), %rcx
-; HASWELL-NEXT:    lzcntq %rdi, %rax
+; HASWELL-NEXT:    lzcntq (%rsi), %rcx # sched: [3:1.00]
+; HASWELL-NEXT:    lzcntq %rdi, %rax # sched: [3:1.00]
 ; HASWELL-NEXT:    orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ctlz_i64:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/movbe-schedule.ll b/llvm/test/CodeGen/X86/movbe-schedule.ll
index 7091183..9b17f0d 100644
--- a/llvm/test/CodeGen/X86/movbe-schedule.ll
+++ b/llvm/test/CodeGen/X86/movbe-schedule.ll
@@ -33,9 +33,9 @@
 ;
 ; HASWELL-LABEL: test_ctlz_i16:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    movbew (%rdi), %ax # sched: [6:0.50]
+; HASWELL-NEXT:    movbew (%rdi), %ax # sched: [1:0.50]
 ; HASWELL-NEXT:    movbew %si, (%rdx) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ctlz_i16:
 ; BTVER2:       # BB#0:
@@ -83,7 +83,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    movbel (%rdi), %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    movbel %esi, (%rdx) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ctlz_i32:
 ; BTVER2:       # BB#0:
@@ -129,9 +129,9 @@
 ;
 ; HASWELL-LABEL: test_ctlz_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    movbeq (%rdi), %rax # sched: [6:0.50]
+; HASWELL-NEXT:    movbeq (%rdi), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    movbeq %rsi, (%rdx) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ctlz_i64:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll
index d545b47..38599f6 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i32.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll
@@ -17,7 +17,7 @@
 ; X64-HSW-LABEL: test_mul_by_1:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_1:
 ; X64-JAG:       # BB#0:
@@ -32,7 +32,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_1:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_1:
 ; JAG-NOOPT:       # BB#0:
@@ -63,7 +63,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_2:
 ; X64-JAG:       # BB#0:
@@ -81,7 +81,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_2:
 ; JAG-NOOPT:       # BB#0:
@@ -114,7 +114,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_3:
 ; X64-JAG:       # BB#0:
@@ -131,7 +131,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_3:
 ; JAG-NOOPT:       # BB#0:
@@ -165,7 +165,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_4:
 ; X64-JAG:       # BB#0:
@@ -183,7 +183,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_4:
 ; JAG-NOOPT:       # BB#0:
@@ -216,7 +216,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_5:
 ; X64-JAG:       # BB#0:
@@ -233,7 +233,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_5:
 ; JAG-NOOPT:       # BB#0:
@@ -269,7 +269,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    addl %edi, %edi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_6:
 ; X64-JAG:       # BB#0:
@@ -285,8 +285,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_6:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $6, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $6, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_6:
 ; JAG-NOOPT:       # BB#0:
@@ -321,7 +321,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_7:
 ; X64-JAG:       # BB#0:
@@ -337,8 +337,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_7:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $7, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $7, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_7:
 ; JAG-NOOPT:       # BB#0:
@@ -371,7 +371,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_8:
 ; X64-JAG:       # BB#0:
@@ -389,7 +389,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_8:
 ; JAG-NOOPT:       # BB#0:
@@ -422,7 +422,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_9:
 ; X64-JAG:       # BB#0:
@@ -439,7 +439,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_9:
 ; JAG-NOOPT:       # BB#0:
@@ -475,7 +475,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    addl %edi, %edi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_10:
 ; X64-JAG:       # BB#0:
@@ -491,8 +491,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_10:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $10, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $10, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_10:
 ; JAG-NOOPT:       # BB#0:
@@ -527,7 +527,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_11:
 ; X64-JAG:       # BB#0:
@@ -543,8 +543,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_11:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $11, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $11, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_11:
 ; JAG-NOOPT:       # BB#0:
@@ -577,7 +577,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    shll $2, %edi # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_12:
 ; X64-JAG:       # BB#0:
@@ -593,8 +593,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_12:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $12, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $12, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_12:
 ; JAG-NOOPT:       # BB#0:
@@ -629,7 +629,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_13:
 ; X64-JAG:       # BB#0:
@@ -645,8 +645,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_13:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $13, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $13, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_13:
 ; JAG-NOOPT:       # BB#0:
@@ -681,7 +681,7 @@
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_14:
 ; X64-JAG:       # BB#0:
@@ -698,8 +698,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_14:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $14, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $14, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_14:
 ; JAG-NOOPT:       # BB#0:
@@ -732,7 +732,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_15:
 ; X64-JAG:       # BB#0:
@@ -748,8 +748,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_15:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $15, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $15, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_15:
 ; JAG-NOOPT:       # BB#0:
@@ -782,7 +782,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shll $4, %edi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_16:
 ; X64-JAG:       # BB#0:
@@ -800,7 +800,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    shll $4, %edi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_16:
 ; JAG-NOOPT:       # BB#0:
@@ -838,7 +838,7 @@
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT:    shll $4, %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_17:
 ; X64-JAG:       # BB#0:
@@ -855,8 +855,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_17:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $17, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $17, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_17:
 ; JAG-NOOPT:       # BB#0:
@@ -892,7 +892,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    addl %edi, %edi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_18:
 ; X64-JAG:       # BB#0:
@@ -908,8 +908,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_18:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $18, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $18, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_18:
 ; JAG-NOOPT:       # BB#0:
@@ -946,7 +946,7 @@
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    shll $2, %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_19:
 ; X64-JAG:       # BB#0:
@@ -963,8 +963,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_19:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $19, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $19, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_19:
 ; JAG-NOOPT:       # BB#0:
@@ -997,7 +997,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    shll $2, %edi # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_20:
 ; X64-JAG:       # BB#0:
@@ -1013,8 +1013,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_20:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $20, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $20, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_20:
 ; JAG-NOOPT:       # BB#0:
@@ -1049,7 +1049,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_21:
 ; X64-JAG:       # BB#0:
@@ -1065,8 +1065,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_21:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $21, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $21, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_21:
 ; JAG-NOOPT:       # BB#0:
@@ -1101,7 +1101,7 @@
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_22:
 ; X64-JAG:       # BB#0:
@@ -1118,8 +1118,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_22:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $22, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $22, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_22:
 ; JAG-NOOPT:       # BB#0:
@@ -1154,7 +1154,7 @@
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    shll $3, %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_23:
 ; X64-JAG:       # BB#0:
@@ -1171,8 +1171,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_23:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $23, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $23, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_23:
 ; JAG-NOOPT:       # BB#0:
@@ -1205,7 +1205,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    shll $3, %edi # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_24:
 ; X64-JAG:       # BB#0:
@@ -1221,8 +1221,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_24:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $24, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $24, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_24:
 ; JAG-NOOPT:       # BB#0:
@@ -1257,7 +1257,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_25:
 ; X64-JAG:       # BB#0:
@@ -1273,8 +1273,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_25:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $25, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $25, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_25:
 ; JAG-NOOPT:       # BB#0:
@@ -1311,7 +1311,7 @@
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_26:
 ; X64-JAG:       # BB#0:
@@ -1328,8 +1328,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_26:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $26, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $26, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_26:
 ; JAG-NOOPT:       # BB#0:
@@ -1362,7 +1362,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_27:
 ; X64-JAG:       # BB#0:
@@ -1378,8 +1378,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_27:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $27, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $27, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_27:
 ; JAG-NOOPT:       # BB#0:
@@ -1416,7 +1416,7 @@
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_28:
 ; X64-JAG:       # BB#0:
@@ -1433,8 +1433,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_28:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $28, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $28, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_28:
 ; JAG-NOOPT:       # BB#0:
@@ -1471,7 +1471,7 @@
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
@@ -1489,8 +1489,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_29:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $29, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $29, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_29:
 ; JAG-NOOPT:       # BB#0:
@@ -1526,7 +1526,7 @@
 ; X64-HSW-NEXT:    shll $5, %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_30:
 ; X64-JAG:       # BB#0:
@@ -1543,8 +1543,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_30:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $30, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $30, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_30:
 ; JAG-NOOPT:       # BB#0:
@@ -1578,7 +1578,7 @@
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT:    shll $5, %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_31:
 ; X64-JAG:       # BB#0:
@@ -1594,8 +1594,8 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_31:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $31, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $31, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_31:
 ; JAG-NOOPT:       # BB#0:
@@ -1628,7 +1628,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shll $5, %edi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_32:
 ; X64-JAG:       # BB#0:
@@ -1646,7 +1646,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    shll $5, %edi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_32:
 ; JAG-NOOPT:       # BB#0:
@@ -1687,7 +1687,7 @@
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl $2, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT:    imull %ecx, %eax # sched: [4:1.00]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_spec:
 ; X64-JAG:       # BB#0:
@@ -1713,7 +1713,7 @@
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    addl $2, %eax # sched: [1:0.25]
 ; HSW-NOOPT-NEXT:    imull %ecx, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_spec:
 ; JAG-NOOPT:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll
index ea841c7..98568a6 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i64.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll
@@ -18,7 +18,7 @@
 ; X64-HSW-LABEL: test_mul_by_1:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_1:
 ; X64-JAG:       # BB#0:
@@ -34,7 +34,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_1:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_1:
 ; JAG-NOOPT:       # BB#0:
@@ -66,7 +66,7 @@
 ; X64-HSW-LABEL: test_mul_by_2:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_2:
 ; X64-JAG:       # BB#0:
@@ -84,7 +84,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_2:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_2:
 ; JAG-NOOPT:       # BB#0:
@@ -116,7 +116,7 @@
 ; X64-HSW-LABEL: test_mul_by_3:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_3:
 ; X64-JAG:       # BB#0:
@@ -134,7 +134,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_3:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_3:
 ; JAG-NOOPT:       # BB#0:
@@ -166,7 +166,7 @@
 ; X64-HSW-LABEL: test_mul_by_4:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_4:
 ; X64-JAG:       # BB#0:
@@ -184,7 +184,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_4:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_4:
 ; JAG-NOOPT:       # BB#0:
@@ -216,7 +216,7 @@
 ; X64-HSW-LABEL: test_mul_by_5:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_5:
 ; X64-JAG:       # BB#0:
@@ -234,7 +234,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_5:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_5:
 ; JAG-NOOPT:       # BB#0:
@@ -268,7 +268,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    addq %rdi, %rdi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_6:
 ; X64-JAG:       # BB#0:
@@ -287,7 +287,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_6:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $6, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_6:
 ; JAG-NOOPT:       # BB#0:
@@ -323,7 +323,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_7:
 ; X64-JAG:       # BB#0:
@@ -342,7 +342,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_7:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $7, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_7:
 ; JAG-NOOPT:       # BB#0:
@@ -375,7 +375,7 @@
 ; X64-HSW-LABEL: test_mul_by_8:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_8:
 ; X64-JAG:       # BB#0:
@@ -393,7 +393,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_8:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_8:
 ; JAG-NOOPT:       # BB#0:
@@ -425,7 +425,7 @@
 ; X64-HSW-LABEL: test_mul_by_9:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_9:
 ; X64-JAG:       # BB#0:
@@ -443,7 +443,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_9:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_9:
 ; JAG-NOOPT:       # BB#0:
@@ -477,7 +477,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    addq %rdi, %rdi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_10:
 ; X64-JAG:       # BB#0:
@@ -496,7 +496,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_10:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $10, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_10:
 ; JAG-NOOPT:       # BB#0:
@@ -532,7 +532,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_11:
 ; X64-JAG:       # BB#0:
@@ -551,7 +551,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_11:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $11, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_11:
 ; JAG-NOOPT:       # BB#0:
@@ -585,7 +585,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $2, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_12:
 ; X64-JAG:       # BB#0:
@@ -604,7 +604,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_12:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $12, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_12:
 ; JAG-NOOPT:       # BB#0:
@@ -640,7 +640,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_13:
 ; X64-JAG:       # BB#0:
@@ -659,7 +659,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_13:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $13, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_13:
 ; JAG-NOOPT:       # BB#0:
@@ -696,7 +696,7 @@
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_14:
 ; X64-JAG:       # BB#0:
@@ -716,7 +716,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_14:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $14, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_14:
 ; JAG-NOOPT:       # BB#0:
@@ -751,7 +751,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_15:
 ; X64-JAG:       # BB#0:
@@ -770,7 +770,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_15:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $15, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_15:
 ; JAG-NOOPT:       # BB#0:
@@ -804,7 +804,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $4, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_16:
 ; X64-JAG:       # BB#0:
@@ -824,7 +824,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    shlq $4, %rdi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_16:
 ; JAG-NOOPT:       # BB#0:
@@ -864,7 +864,7 @@
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT:    shlq $4, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_17:
 ; X64-JAG:       # BB#0:
@@ -884,7 +884,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_17:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $17, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_17:
 ; JAG-NOOPT:       # BB#0:
@@ -920,7 +920,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    addq %rdi, %rdi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_18:
 ; X64-JAG:       # BB#0:
@@ -939,7 +939,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_18:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $18, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_18:
 ; JAG-NOOPT:       # BB#0:
@@ -977,7 +977,7 @@
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    shlq $2, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_19:
 ; X64-JAG:       # BB#0:
@@ -997,7 +997,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_19:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $19, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_19:
 ; JAG-NOOPT:       # BB#0:
@@ -1031,7 +1031,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $2, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_20:
 ; X64-JAG:       # BB#0:
@@ -1050,7 +1050,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_20:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $20, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_20:
 ; JAG-NOOPT:       # BB#0:
@@ -1086,7 +1086,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_21:
 ; X64-JAG:       # BB#0:
@@ -1105,7 +1105,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_21:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $21, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_21:
 ; JAG-NOOPT:       # BB#0:
@@ -1142,7 +1142,7 @@
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_22:
 ; X64-JAG:       # BB#0:
@@ -1162,7 +1162,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_22:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $22, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_22:
 ; JAG-NOOPT:       # BB#0:
@@ -1199,7 +1199,7 @@
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    shlq $3, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_23:
 ; X64-JAG:       # BB#0:
@@ -1219,7 +1219,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_23:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $23, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_23:
 ; JAG-NOOPT:       # BB#0:
@@ -1253,7 +1253,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $3, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_24:
 ; X64-JAG:       # BB#0:
@@ -1272,7 +1272,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_24:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $24, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_24:
 ; JAG-NOOPT:       # BB#0:
@@ -1308,7 +1308,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_25:
 ; X64-JAG:       # BB#0:
@@ -1327,7 +1327,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_25:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $25, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_25:
 ; JAG-NOOPT:       # BB#0:
@@ -1365,7 +1365,7 @@
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_26:
 ; X64-JAG:       # BB#0:
@@ -1385,7 +1385,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_26:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $26, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_26:
 ; JAG-NOOPT:       # BB#0:
@@ -1420,7 +1420,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_27:
 ; X64-JAG:       # BB#0:
@@ -1439,7 +1439,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_27:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $27, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_27:
 ; JAG-NOOPT:       # BB#0:
@@ -1477,7 +1477,7 @@
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_28:
 ; X64-JAG:       # BB#0:
@@ -1497,7 +1497,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_28:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $28, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_28:
 ; JAG-NOOPT:       # BB#0:
@@ -1536,7 +1536,7 @@
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
@@ -1557,7 +1557,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_29:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $29, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_29:
 ; JAG-NOOPT:       # BB#0:
@@ -1596,7 +1596,7 @@
 ; X64-HSW-NEXT:    shlq $5, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_30:
 ; X64-JAG:       # BB#0:
@@ -1617,7 +1617,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_30:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $30, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_30:
 ; JAG-NOOPT:       # BB#0:
@@ -1654,7 +1654,7 @@
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT:    shlq $5, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_31:
 ; X64-JAG:       # BB#0:
@@ -1674,7 +1674,7 @@
 ; HSW-NOOPT-LABEL: test_mul_by_31:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $31, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_31:
 ; JAG-NOOPT:       # BB#0:
@@ -1709,7 +1709,7 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $5, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_32:
 ; X64-JAG:       # BB#0:
@@ -1729,7 +1729,7 @@
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    shlq $5, %rdi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_32:
 ; JAG-NOOPT:       # BB#0:
@@ -1792,8 +1792,8 @@
 ; X64-HSW-NEXT:    addq $42, %rcx # sched: [1:0.25]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq $2, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    imulq %rcx, %rax # sched: [4:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_spec:
 ; X64-JAG:       # BB#0:
@@ -1840,8 +1840,8 @@
 ; HSW-NOOPT-NEXT:    addq $42, %rcx # sched: [1:0.25]
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    addq $2, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imulq %rcx, %rax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_spec:
 ; JAG-NOOPT:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/popcnt-schedule.ll b/llvm/test/CodeGen/X86/popcnt-schedule.ll
index e625748..f393e62 100644
--- a/llvm/test/CodeGen/X86/popcnt-schedule.ll
+++ b/llvm/test/CodeGen/X86/popcnt-schedule.ll
@@ -37,11 +37,11 @@
 ;
 ; HASWELL-LABEL: test_ctpop_i16:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    popcntw (%rsi), %cx # sched: [7:1.00]
+; HASWELL-NEXT:    popcntw (%rsi), %cx # sched: [3:1.00]
 ; HASWELL-NEXT:    popcntw %di, %ax # sched: [3:1.00]
 ; HASWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; HASWELL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ctpop_i16:
 ; BTVER2:       # BB#0:
@@ -90,10 +90,10 @@
 ;
 ; HASWELL-LABEL: test_ctpop_i32:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    popcntl (%rsi), %ecx # sched: [7:1.00]
+; HASWELL-NEXT:    popcntl (%rsi), %ecx # sched: [3:1.00]
 ; HASWELL-NEXT:    popcntl %edi, %eax # sched: [3:1.00]
 ; HASWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ctpop_i32:
 ; BTVER2:       # BB#0:
@@ -140,10 +140,10 @@
 ;
 ; HASWELL-LABEL: test_ctpop_i64:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    popcntq (%rsi), %rcx # sched: [7:1.00]
+; HASWELL-NEXT:    popcntq (%rsi), %rcx # sched: [3:1.00]
 ; HASWELL-NEXT:    popcntq %rdi, %rax # sched: [3:1.00]
 ; HASWELL-NEXT:    orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ctpop_i64:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll
index e07340b..7983bc9 100644
--- a/llvm/test/CodeGen/X86/pr32329.ll
+++ b/llvm/test/CodeGen/X86/pr32329.ll
@@ -36,33 +36,33 @@
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:  .Lcfi7:
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl obj, %edx
 ; X86-NEXT:    movsbl var_27, %eax
-; X86-NEXT:    movzwl var_2, %esi
 ; X86-NEXT:    movl var_310, %ecx
 ; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl obj, %esi
 ; X86-NEXT:    addl var_24, %ecx
-; X86-NEXT:    andl $4194303, %edx # imm = 0x3FFFFF
-; X86-NEXT:    leal (%edx,%edx), %ebx
-; X86-NEXT:    subl %eax, %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    subl %esi, %edi
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movzwl var_2, %edi
+; X86-NEXT:    andl $4194303, %esi # imm = 0x3FFFFF
+; X86-NEXT:    leal (%esi,%esi), %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
-; X86-NEXT:    movl $9, %esi
+; X86-NEXT:    movl $9, %edi
 ; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    shldl %cl, %esi, %ebp
-; X86-NEXT:    shlxl %ecx, %esi, %esi
+; X86-NEXT:    shldl %cl, %edi, %ebp
+; X86-NEXT:    shlxl %ecx, %edi, %edi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    cmovnel %edi, %ebp
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovnel %ecx, %esi
-; X86-NEXT:    cmpl %edx, %edi
+; X86-NEXT:    cmovnel %ecx, %edi
 ; X86-NEXT:    movl %ebp, var_50+4
-; X86-NEXT:    movl %esi, var_50
+; X86-NEXT:    cmpl %esi, %ebx
 ; X86-NEXT:    setge var_205
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    movb %bl, var_218
+; X86-NEXT:    imull %eax, %edx
+; X86-NEXT:    movl %edi, var_50
+; X86-NEXT:    movb %dl, var_218
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -71,25 +71,25 @@
 ;
 ; X64-LABEL: foo:
 ; X64:       # BB#0: # %entry
-; X64-NEXT:    movl {{.*}}(%rip), %eax
-; X64-NEXT:    movsbl {{.*}}(%rip), %r9d
+; X64-NEXT:    movsbl {{.*}}(%rip), %eax
+; X64-NEXT:    movl {{.*}}(%rip), %ecx
+; X64-NEXT:    imull %eax, %ecx
+; X64-NEXT:    movl {{.*}}(%rip), %edx
+; X64-NEXT:    addl {{.*}}(%rip), %ecx
 ; X64-NEXT:    movzwl {{.*}}(%rip), %r8d
-; X64-NEXT:    movl {{.*}}(%rip), %esi
-; X64-NEXT:    imull %r9d, %esi
-; X64-NEXT:    addl {{.*}}(%rip), %esi
-; X64-NEXT:    andl $4194303, %eax # imm = 0x3FFFFF
-; X64-NEXT:    leal (%rax,%rax), %edi
-; X64-NEXT:    subl %r9d, %edi
-; X64-NEXT:    movl %edi, %edx
-; X64-NEXT:    subl %r8d, %edx
-; X64-NEXT:    imull %edx, %esi
-; X64-NEXT:    addl $-1437483407, %esi # imm = 0xAA51BE71
-; X64-NEXT:    movl $9, %ecx
-; X64-NEXT:    shlxq %rsi, %rcx, %rcx
-; X64-NEXT:    movq %rcx, {{.*}}(%rip)
-; X64-NEXT:    cmpl %eax, %edx
+; X64-NEXT:    andl $4194303, %edx # imm = 0x3FFFFF
+; X64-NEXT:    leal (%rdx,%rdx), %edi
+; X64-NEXT:    subl %eax, %edi
+; X64-NEXT:    movl %edi, %esi
+; X64-NEXT:    subl %r8d, %esi
+; X64-NEXT:    imull %esi, %ecx
+; X64-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
+; X64-NEXT:    movl $9, %r8d
+; X64-NEXT:    cmpl %edx, %esi
 ; X64-NEXT:    setge {{.*}}(%rip)
-; X64-NEXT:    imull %r9d, %edi
+; X64-NEXT:    shlxq %rcx, %r8, %rcx
+; X64-NEXT:    imull %eax, %edi
+; X64-NEXT:    movq %rcx, {{.*}}(%rip)
 ; X64-NEXT:    movb %dil, {{.*}}(%rip)
 ; X64-NEXT:    retq
   entry:
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index 437ac3c..e33c688 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -51,9 +51,9 @@
 ;
 ; HASWELL-LABEL: f32_no_estimate:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_no_estimate:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -63,9 +63,9 @@
 ;
 ; AVX512-LABEL: f32_no_estimate:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; AVX512-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; AVX512-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -122,9 +122,9 @@
 ; HASWELL-LABEL: f32_one_step:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -139,9 +139,9 @@
 ; AVX512-LABEL: f32_one_step:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -220,13 +220,13 @@
 ; HASWELL-LABEL: f32_two_step:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -245,13 +245,13 @@
 ; AVX512-LABEL: f32_two_step:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -290,9 +290,9 @@
 ;
 ; HASWELL-LABEL: v4f32_no_estimate:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -302,9 +302,9 @@
 ;
 ; AVX512-LABEL: v4f32_no_estimate:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
-; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50]
+; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -361,10 +361,10 @@
 ; HASWELL-LABEL: v4f32_one_step:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -379,17 +379,17 @@
 ; KNL-LABEL: v4f32_one_step:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_one_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -468,13 +468,13 @@
 ; HASWELL-LABEL: v4f32_two_step:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_two_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -493,24 +493,24 @@
 ; KNL-LABEL: v4f32_two_step:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_two_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -552,9 +552,9 @@
 ;
 ; HASWELL-LABEL: v8f32_no_estimate:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -564,9 +564,9 @@
 ;
 ; AVX512-LABEL: v8f32_no_estimate:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -629,11 +629,11 @@
 ;
 ; HASWELL-LABEL: v8f32_one_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -647,18 +647,18 @@
 ;
 ; KNL-LABEL: v8f32_one_step:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_one_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -749,14 +749,14 @@
 ;
 ; HASWELL-LABEL: v8f32_two_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_two_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -774,25 +774,25 @@
 ;
 ; KNL-LABEL: v8f32_two_step:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
 ; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_two_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 7e159c0..ac63974 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -45,20 +45,20 @@
 ; HASWELL-LABEL: f32_no_step_2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_no_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; AVX512-LABEL: f32_no_step_2:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 1234.0, %x
   ret float %div
 }
@@ -120,29 +120,29 @@
 ; HASWELL-LABEL: f32_one_step_2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; AVX512-LABEL: f32_one_step_2:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 3456.0, %x
   ret float %div
 }
@@ -209,32 +209,32 @@
 ; HASWELL-LABEL: f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; AVX512-LABEL: f32_one_step_2_divs:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 3456.0, %x
   %div2 = fdiv fast float %div, %x
   ret float %div2
@@ -319,20 +319,20 @@
 ; HASWELL-LABEL: f32_two_step_2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [1:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -340,20 +340,20 @@
 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; AVX512-LABEL: f32_two_step_2:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 6789.0, %x
   ret float %div
 }
@@ -415,39 +415,39 @@
 ; HASWELL-LABEL: v4f32_one_step2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v4f32_one_step2:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_one_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
@@ -514,43 +514,43 @@
 ; HASWELL-LABEL: v4f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v4f32_one_step_2_divs:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_one_step_2_divs:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   %div2 = fdiv fast <4 x float> %div, %x
   ret <4 x float> %div2
@@ -635,20 +635,20 @@
 ; HASWELL-LABEL: v4f32_two_step2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [1:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -656,32 +656,32 @@
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v4f32_two_step2:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_two_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
@@ -750,40 +750,40 @@
 ;
 ; HASWELL-LABEL: v8f32_one_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_one_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_one_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -858,44 +858,44 @@
 ;
 ; HASWELL-LABEL: v8f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_one_step_2_divs:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_one_step_2_divs:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   %div2 = fdiv fast <8 x float> %div, %x
   ret <8 x float> %div2
@@ -993,54 +993,54 @@
 ;
 ; HASWELL-LABEL: v8f32_two_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_two_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
 ; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_two_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -1074,23 +1074,23 @@
 ;
 ; HASWELL-LABEL: v8f32_no_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_step:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_no_step:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_no_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -1130,27 +1130,27 @@
 ;
 ; HASWELL-LABEL: v8f32_no_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_no_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_no_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
diff --git a/llvm/test/CodeGen/X86/sha-schedule.ll b/llvm/test/CodeGen/X86/sha-schedule.ll
index bf82ba8..bd9be7e 100644
--- a/llvm/test/CodeGen/X86/sha-schedule.ll
+++ b/llvm/test/CodeGen/X86/sha-schedule.ll
@@ -25,7 +25,7 @@
 ; CANNONLAKE:       # BB#0:
 ; CANNONLAKE-NEXT:    sha1msg1 %xmm1, %xmm0
 ; CANNONLAKE-NEXT:    sha1msg1 (%rdi), %xmm0
-; CANNONLAKE-NEXT:    retq # sched: [1:1.00]
+; CANNONLAKE-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_sha1msg1:
 ; ZNVER1:       # BB#0:
@@ -56,7 +56,7 @@
 ; CANNONLAKE:       # BB#0:
 ; CANNONLAKE-NEXT:    sha1msg2 %xmm1, %xmm0
 ; CANNONLAKE-NEXT:    sha1msg2 (%rdi), %xmm0
-; CANNONLAKE-NEXT:    retq # sched: [1:1.00]
+; CANNONLAKE-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_sha1msg2:
 ; ZNVER1:       # BB#0:
@@ -87,7 +87,7 @@
 ; CANNONLAKE:       # BB#0:
 ; CANNONLAKE-NEXT:    sha1nexte %xmm1, %xmm0
 ; CANNONLAKE-NEXT:    sha1nexte (%rdi), %xmm0
-; CANNONLAKE-NEXT:    retq # sched: [1:1.00]
+; CANNONLAKE-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_sha1nexte:
 ; ZNVER1:       # BB#0:
@@ -118,7 +118,7 @@
 ; CANNONLAKE:       # BB#0:
 ; CANNONLAKE-NEXT:    sha1rnds4 $3, %xmm1, %xmm0
 ; CANNONLAKE-NEXT:    sha1rnds4 $3, (%rdi), %xmm0
-; CANNONLAKE-NEXT:    retq # sched: [1:1.00]
+; CANNONLAKE-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_sha1rnds4:
 ; ZNVER1:       # BB#0:
@@ -153,7 +153,7 @@
 ; CANNONLAKE:       # BB#0:
 ; CANNONLAKE-NEXT:    sha256msg1 %xmm1, %xmm0
 ; CANNONLAKE-NEXT:    sha256msg1 (%rdi), %xmm0
-; CANNONLAKE-NEXT:    retq # sched: [1:1.00]
+; CANNONLAKE-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_sha256msg1:
 ; ZNVER1:       # BB#0:
@@ -184,7 +184,7 @@
 ; CANNONLAKE:       # BB#0:
 ; CANNONLAKE-NEXT:    sha256msg2 %xmm1, %xmm0
 ; CANNONLAKE-NEXT:    sha256msg2 (%rdi), %xmm0
-; CANNONLAKE-NEXT:    retq # sched: [1:1.00]
+; CANNONLAKE-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_sha256msg2:
 ; ZNVER1:       # BB#0:
@@ -224,7 +224,7 @@
 ; CANNONLAKE-NEXT:    sha256rnds2 %xmm0, %xmm1, %xmm3
 ; CANNONLAKE-NEXT:    sha256rnds2 %xmm0, (%rdi), %xmm3
 ; CANNONLAKE-NEXT:    vmovaps %xmm3, %xmm0 # sched: [1:1.00]
-; CANNONLAKE-NEXT:    retq # sched: [1:1.00]
+; CANNONLAKE-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_sha256rnds2:
 ; ZNVER1:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse-schedule.ll b/llvm/test/CodeGen/X86/sse-schedule.ll
index 2ddefa1..8eb7b3f 100644
--- a/llvm/test/CodeGen/X86/sse-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse-schedule.ll
@@ -37,8 +37,8 @@
 ; HASWELL-LABEL: test_addps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addps:
 ; BTVER2:       # BB#0:
@@ -85,8 +85,8 @@
 ; HASWELL-LABEL: test_addss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addss:
 ; BTVER2:       # BB#0:
@@ -137,8 +137,8 @@
 ; HASWELL-LABEL: test_andps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andps:
 ; BTVER2:       # BB#0:
@@ -193,8 +193,8 @@
 ; HASWELL-LABEL: test_andnotps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andnotps:
 ; BTVER2:       # BB#0:
@@ -251,9 +251,9 @@
 ; HASWELL-LABEL: test_cmpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmpps:
 ; BTVER2:       # BB#0:
@@ -306,7 +306,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmpss:
 ; BTVER2:       # BB#0:
@@ -399,7 +399,7 @@
 ; HASWELL-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_comiss:
 ; BTVER2:       # BB#0:
@@ -470,7 +470,7 @@
 ; HASWELL-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
 ; HASWELL-NEXT:    vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; HASWELL-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2ss:
 ; BTVER2:       # BB#0:
@@ -523,10 +523,10 @@
 ;
 ; HASWELL-LABEL: test_cvtsi2ssq:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
 ; HASWELL-NEXT:    vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; HASWELL-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2ssq:
 ; BTVER2:       # BB#0:
@@ -580,9 +580,9 @@
 ; HASWELL-LABEL: test_cvtss2si:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtss2si (%rdi), %eax # sched: [4:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtss2si:
 ; BTVER2:       # BB#0:
@@ -639,9 +639,9 @@
 ; HASWELL-LABEL: test_cvtss2siq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtss2si (%rdi), %rax # sched: [4:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtss2siq:
 ; BTVER2:       # BB#0:
@@ -698,9 +698,9 @@
 ; HASWELL-LABEL: test_cvttss2si:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttss2si (%rdi), %eax # sched: [4:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttss2si:
 ; BTVER2:       # BB#0:
@@ -754,9 +754,9 @@
 ; HASWELL-LABEL: test_cvttss2siq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttss2si (%rdi), %rax # sched: [4:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttss2siq:
 ; BTVER2:       # BB#0:
@@ -805,9 +805,9 @@
 ;
 ; HASWELL-LABEL: test_divps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divps:
 ; BTVER2:       # BB#0:
@@ -853,9 +853,9 @@
 ;
 ; HASWELL-LABEL: test_divss:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divss:
 ; BTVER2:       # BB#0:
@@ -902,8 +902,8 @@
 ; HASWELL-LABEL: test_ldmxcsr:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; HASWELL-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [6:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ldmxcsr:
 ; BTVER2:       # BB#0:
@@ -952,8 +952,8 @@
 ; HASWELL-LABEL: test_maxps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxps:
 ; BTVER2:       # BB#0:
@@ -1001,8 +1001,8 @@
 ; HASWELL-LABEL: test_maxss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxss:
 ; BTVER2:       # BB#0:
@@ -1050,8 +1050,8 @@
 ; HASWELL-LABEL: test_minps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minps:
 ; BTVER2:       # BB#0:
@@ -1099,8 +1099,8 @@
 ; HASWELL-LABEL: test_minss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minss:
 ; BTVER2:       # BB#0:
@@ -1151,10 +1151,10 @@
 ;
 ; HASWELL-LABEL: test_movaps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovaps (%rdi), %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movaps:
 ; BTVER2:       # BB#0:
@@ -1207,7 +1207,7 @@
 ; HASWELL-LABEL: test_movhlps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movhlps:
 ; BTVER2:       # BB#0:
@@ -1257,10 +1257,10 @@
 ;
 ; HASWELL-LABEL: test_movhps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movhps:
 ; BTVER2:       # BB#0:
@@ -1316,7 +1316,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movlhps:
 ; BTVER2:       # BB#0:
@@ -1365,10 +1365,10 @@
 ;
 ; HASWELL-LABEL: test_movlps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movlps:
 ; BTVER2:       # BB#0:
@@ -1419,7 +1419,7 @@
 ; HASWELL-LABEL: test_movmskps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovmskps %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movmskps:
 ; BTVER2:       # BB#0:
@@ -1465,7 +1465,7 @@
 ; HASWELL-LABEL: test_movntps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntps:
 ; BTVER2:       # BB#0:
@@ -1511,10 +1511,10 @@
 ;
 ; HASWELL-LABEL: test_movss_mem:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
 ; HASWELL-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movss_mem:
 ; BTVER2:       # BB#0:
@@ -1565,7 +1565,7 @@
 ; HASWELL-LABEL: test_movss_reg:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movss_reg:
 ; BTVER2:       # BB#0:
@@ -1611,10 +1611,10 @@
 ;
 ; HASWELL-LABEL: test_movups:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovups (%rdi), %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movups:
 ; BTVER2:       # BB#0:
@@ -1663,8 +1663,8 @@
 ; HASWELL-LABEL: test_mulps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulps:
 ; BTVER2:       # BB#0:
@@ -1711,8 +1711,8 @@
 ; HASWELL-LABEL: test_mulss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulss:
 ; BTVER2:       # BB#0:
@@ -1763,8 +1763,8 @@
 ; HASWELL-LABEL: test_orps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_orps:
 ; BTVER2:       # BB#0:
@@ -1816,8 +1816,8 @@
 ;
 ; HASWELL-LABEL: test_prefetchnta:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    prefetchnta (%rdi) # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    prefetchnta (%rdi) # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_prefetchnta:
 ; BTVER2:       # BB#0:
@@ -1867,9 +1867,9 @@
 ; HASWELL-LABEL: test_rcpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT:    vrcpps (%rdi), %xmm1 # sched: [5:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rcpps:
 ; BTVER2:       # BB#0:
@@ -1929,11 +1929,11 @@
 ;
 ; HASWELL-LABEL: test_rcpss:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; HASWELL-NEXT:    vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; HASWELL-NEXT:    vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rcpss:
 ; BTVER2:       # BB#0:
@@ -1994,9 +1994,9 @@
 ; HASWELL-LABEL: test_rsqrtps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [5:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rsqrtps:
 ; BTVER2:       # BB#0:
@@ -2057,10 +2057,10 @@
 ; HASWELL-LABEL: test_rsqrtss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
 ; HASWELL-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rsqrtss:
 ; BTVER2:       # BB#0:
@@ -2116,8 +2116,8 @@
 ;
 ; HASWELL-LABEL: test_sfence:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    sfence # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    sfence # sched: [1:0.33]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sfence:
 ; BTVER2:       # BB#0:
@@ -2165,8 +2165,8 @@
 ; HASWELL-LABEL: test_shufps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; HASWELL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_shufps:
 ; BTVER2:       # BB#0:
@@ -2217,10 +2217,10 @@
 ;
 ; HASWELL-LABEL: test_sqrtps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [14:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtps:
 ; BTVER2:       # BB#0:
@@ -2280,11 +2280,11 @@
 ;
 ; HASWELL-LABEL: test_sqrtss:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT:    vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT:    vmovaps (%rdi), %xmm1 # sched: [1:0.50]
+; HASWELL-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtss:
 ; BTVER2:       # BB#0:
@@ -2336,9 +2336,9 @@
 ;
 ; HASWELL-LABEL: test_stmxcsr:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
-; HASWELL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; HASWELL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_stmxcsr:
 ; BTVER2:       # BB#0:
@@ -2387,8 +2387,8 @@
 ; HASWELL-LABEL: test_subps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subps:
 ; BTVER2:       # BB#0:
@@ -2435,8 +2435,8 @@
 ; HASWELL-LABEL: test_subss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subss:
 ; BTVER2:       # BB#0:
@@ -2524,7 +2524,7 @@
 ; HASWELL-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ucomiss:
 ; BTVER2:       # BB#0:
@@ -2593,8 +2593,8 @@
 ; HASWELL-LABEL: test_unpckhps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhps:
 ; BTVER2:       # BB#0:
@@ -2645,8 +2645,8 @@
 ; HASWELL-LABEL: test_unpcklps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklps:
 ; BTVER2:       # BB#0:
@@ -2697,8 +2697,8 @@
 ; HASWELL-LABEL: test_xorps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_xorps:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse2-schedule.ll b/llvm/test/CodeGen/X86/sse2-schedule.ll
index 28d8b18..23533a7 100644
--- a/llvm/test/CodeGen/X86/sse2-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse2-schedule.ll
@@ -37,8 +37,8 @@
 ; HASWELL-LABEL: test_addpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addpd:
 ; BTVER2:       # BB#0:
@@ -85,8 +85,8 @@
 ; HASWELL-LABEL: test_addsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addsd:
 ; BTVER2:       # BB#0:
@@ -137,9 +137,9 @@
 ; HASWELL-LABEL: test_andpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andpd:
 ; BTVER2:       # BB#0:
@@ -197,9 +197,9 @@
 ; HASWELL-LABEL: test_andnotpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andnotpd:
 ; BTVER2:       # BB#0:
@@ -259,9 +259,9 @@
 ; HASWELL-LABEL: test_cmppd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmppd:
 ; BTVER2:       # BB#0:
@@ -314,7 +314,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmpsd:
 ; BTVER2:       # BB#0:
@@ -407,7 +407,7 @@
 ; HASWELL-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_comisd:
 ; BTVER2:       # BB#0:
@@ -476,9 +476,9 @@
 ; HASWELL-LABEL: test_cvtdq2pd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [4:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2pd:
 ; BTVER2:       # BB#0:
@@ -534,10 +534,10 @@
 ;
 ; HASWELL-LABEL: test_cvtdq2ps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2ps:
 ; BTVER2:       # BB#0:
@@ -592,9 +592,9 @@
 ; HASWELL-LABEL: test_cvtpd2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2dq:
 ; BTVER2:       # BB#0:
@@ -650,9 +650,9 @@
 ; HASWELL-LABEL: test_cvtpd2ps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2ps:
 ; BTVER2:       # BB#0:
@@ -708,9 +708,9 @@
 ; HASWELL-LABEL: test_cvtps2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2dq:
 ; BTVER2:       # BB#0:
@@ -766,9 +766,9 @@
 ; HASWELL-LABEL: test_cvtps2pd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2pd:
 ; BTVER2:       # BB#0:
@@ -824,9 +824,9 @@
 ; HASWELL-LABEL: test_cvtsd2si:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtsd2si (%rdi), %eax # sched: [4:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsd2si:
 ; BTVER2:       # BB#0:
@@ -883,9 +883,9 @@
 ; HASWELL-LABEL: test_cvtsd2siq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtsd2si (%rdi), %rax # sched: [4:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsd2siq:
 ; BTVER2:       # BB#0:
@@ -947,10 +947,10 @@
 ; HASWELL-LABEL: test_cvtsd2ss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50]
 ; HASWELL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
 ; HASWELL-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsd2ss:
 ; BTVER2:       # BB#0:
@@ -1008,7 +1008,7 @@
 ; HASWELL-NEXT:    vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
 ; HASWELL-NEXT:    vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; HASWELL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2sd:
 ; BTVER2:       # BB#0:
@@ -1064,7 +1064,7 @@
 ; HASWELL-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
 ; HASWELL-NEXT:    vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; HASWELL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2sdq:
 ; BTVER2:       # BB#0:
@@ -1125,10 +1125,10 @@
 ; HASWELL-LABEL: test_cvtss2sd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
 ; HASWELL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [2:1.00]
 ; HASWELL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtss2sd:
 ; BTVER2:       # BB#0:
@@ -1185,9 +1185,9 @@
 ; HASWELL-LABEL: test_cvttpd2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttpd2dq:
 ; BTVER2:       # BB#0:
@@ -1244,9 +1244,9 @@
 ; HASWELL-LABEL: test_cvttps2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttps2dq:
 ; BTVER2:       # BB#0:
@@ -1300,9 +1300,9 @@
 ; HASWELL-LABEL: test_cvttsd2si:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttsd2si (%rdi), %eax # sched: [4:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttsd2si:
 ; BTVER2:       # BB#0:
@@ -1356,9 +1356,9 @@
 ; HASWELL-LABEL: test_cvttsd2siq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttsd2si (%rdi), %rax # sched: [4:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttsd2siq:
 ; BTVER2:       # BB#0:
@@ -1407,9 +1407,9 @@
 ;
 ; HASWELL-LABEL: test_divpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divpd:
 ; BTVER2:       # BB#0:
@@ -1455,9 +1455,9 @@
 ;
 ; HASWELL-LABEL: test_divsd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divsd:
 ; BTVER2:       # BB#0:
@@ -1505,8 +1505,8 @@
 ;
 ; HASWELL-LABEL: test_lfence:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    lfence # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    lfence # sched: [2:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lfence:
 ; BTVER2:       # BB#0:
@@ -1551,8 +1551,8 @@
 ;
 ; HASWELL-LABEL: test_mfence:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    mfence # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    mfence # sched: [2:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mfence:
 ; BTVER2:       # BB#0:
@@ -1595,8 +1595,8 @@
 ;
 ; HASWELL-LABEL: test_maskmovdqu:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [14:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovdqu:
 ; BTVER2:       # BB#0:
@@ -1640,8 +1640,8 @@
 ; HASWELL-LABEL: test_maxpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxpd:
 ; BTVER2:       # BB#0:
@@ -1689,8 +1689,8 @@
 ; HASWELL-LABEL: test_maxsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxsd:
 ; BTVER2:       # BB#0:
@@ -1738,8 +1738,8 @@
 ; HASWELL-LABEL: test_minpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minpd:
 ; BTVER2:       # BB#0:
@@ -1787,8 +1787,8 @@
 ; HASWELL-LABEL: test_minsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minsd:
 ; BTVER2:       # BB#0:
@@ -1839,10 +1839,10 @@
 ;
 ; HASWELL-LABEL: test_movapd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovapd (%rdi), %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movapd:
 ; BTVER2:       # BB#0:
@@ -1894,10 +1894,10 @@
 ;
 ; HASWELL-LABEL: test_movdqa:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movdqa:
 ; BTVER2:       # BB#0:
@@ -1949,10 +1949,10 @@
 ;
 ; HASWELL-LABEL: test_movdqu:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movdqu:
 ; BTVER2:       # BB#0:
@@ -2017,12 +2017,12 @@
 ; HASWELL-LABEL: test_movd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovd %edi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vmovd %xmm0, %eax # sched: [1:1.00]
 ; HASWELL-NEXT:    vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movd:
 ; BTVER2:       # BB#0:
@@ -2098,12 +2098,12 @@
 ; HASWELL-LABEL: test_movd_64:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovq %rdi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vmovq %xmm0, %rax # sched: [1:1.00]
 ; HASWELL-NEXT:    vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movd_64:
 ; BTVER2:       # BB#0:
@@ -2166,10 +2166,10 @@
 ;
 ; HASWELL-LABEL: test_movhpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movhpd:
 ; BTVER2:       # BB#0:
@@ -2224,10 +2224,10 @@
 ;
 ; HASWELL-LABEL: test_movlpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movlpd:
 ; BTVER2:       # BB#0:
@@ -2277,7 +2277,7 @@
 ; HASWELL-LABEL: test_movmskpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovmskpd %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movmskpd:
 ; BTVER2:       # BB#0:
@@ -2324,7 +2324,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntdqa:
 ; BTVER2:       # BB#0:
@@ -2371,7 +2371,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntpd:
 ; BTVER2:       # BB#0:
@@ -2420,10 +2420,10 @@
 ;
 ; HASWELL-LABEL: test_movq_mem:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movq_mem:
 ; BTVER2:       # BB#0:
@@ -2477,7 +2477,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
 ; HASWELL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movq_reg:
 ; BTVER2:       # BB#0:
@@ -2526,10 +2526,10 @@
 ;
 ; HASWELL-LABEL: test_movsd_mem:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [1:0.50]
 ; HASWELL-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movsd_mem:
 ; BTVER2:       # BB#0:
@@ -2581,7 +2581,7 @@
 ; HASWELL-LABEL: test_movsd_reg:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movsd_reg:
 ; BTVER2:       # BB#0:
@@ -2627,10 +2627,10 @@
 ;
 ; HASWELL-LABEL: test_movupd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovupd (%rdi), %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movupd:
 ; BTVER2:       # BB#0:
@@ -2679,8 +2679,8 @@
 ; HASWELL-LABEL: test_mulpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulpd:
 ; BTVER2:       # BB#0:
@@ -2727,8 +2727,8 @@
 ; HASWELL-LABEL: test_mulsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulsd:
 ; BTVER2:       # BB#0:
@@ -2779,9 +2779,9 @@
 ; HASWELL-LABEL: test_orpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_orpd:
 ; BTVER2:       # BB#0:
@@ -2839,8 +2839,8 @@
 ; HASWELL-LABEL: test_packssdw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_packssdw:
 ; BTVER2:       # BB#0:
@@ -2893,8 +2893,8 @@
 ; HASWELL-LABEL: test_packsswb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_packsswb:
 ; BTVER2:       # BB#0:
@@ -2947,8 +2947,8 @@
 ; HASWELL-LABEL: test_packuswb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_packuswb:
 ; BTVER2:       # BB#0:
@@ -3001,8 +3001,8 @@
 ; HASWELL-LABEL: test_paddb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddb:
 ; BTVER2:       # BB#0:
@@ -3053,8 +3053,8 @@
 ; HASWELL-LABEL: test_paddd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddd:
 ; BTVER2:       # BB#0:
@@ -3101,8 +3101,8 @@
 ; HASWELL-LABEL: test_paddq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddq:
 ; BTVER2:       # BB#0:
@@ -3153,8 +3153,8 @@
 ; HASWELL-LABEL: test_paddsb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddsb:
 ; BTVER2:       # BB#0:
@@ -3206,8 +3206,8 @@
 ; HASWELL-LABEL: test_paddsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddsw:
 ; BTVER2:       # BB#0:
@@ -3259,8 +3259,8 @@
 ; HASWELL-LABEL: test_paddusb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddusb:
 ; BTVER2:       # BB#0:
@@ -3312,8 +3312,8 @@
 ; HASWELL-LABEL: test_paddusw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddusw:
 ; BTVER2:       # BB#0:
@@ -3365,8 +3365,8 @@
 ; HASWELL-LABEL: test_paddw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddw:
 ; BTVER2:       # BB#0:
@@ -3417,9 +3417,9 @@
 ; HASWELL-LABEL: test_pand:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pand:
 ; BTVER2:       # BB#0:
@@ -3479,9 +3479,9 @@
 ; HASWELL-LABEL: test_pandn:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pandn:
 ; BTVER2:       # BB#0:
@@ -3537,8 +3537,8 @@
 ; HASWELL-LABEL: test_pavgb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pavgb:
 ; BTVER2:       # BB#0:
@@ -3590,8 +3590,8 @@
 ; HASWELL-LABEL: test_pavgw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pavgw:
 ; BTVER2:       # BB#0:
@@ -3645,9 +3645,9 @@
 ; HASWELL-LABEL: test_pcmpeqb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqb:
 ; BTVER2:       # BB#0:
@@ -3704,9 +3704,9 @@
 ; HASWELL-LABEL: test_pcmpeqd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqd:
 ; BTVER2:       # BB#0:
@@ -3763,9 +3763,9 @@
 ; HASWELL-LABEL: test_pcmpeqw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqw:
 ; BTVER2:       # BB#0:
@@ -3823,9 +3823,9 @@
 ; HASWELL-LABEL: test_pcmpgtb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtb:
 ; BTVER2:       # BB#0:
@@ -3883,9 +3883,9 @@
 ; HASWELL-LABEL: test_pcmpgtd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtd:
 ; BTVER2:       # BB#0:
@@ -3943,9 +3943,9 @@
 ; HASWELL-LABEL: test_pcmpgtw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtw:
 ; BTVER2:       # BB#0:
@@ -3995,9 +3995,9 @@
 ;
 ; HASWELL-LABEL: test_pextrw:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpextrw $6, %xmm0, %eax # sched: [1:1.00]
+; HASWELL-NEXT:    vpextrw $6, %xmm0, %eax # sched: [2:1.00]
 ; HASWELL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pextrw:
 ; BTVER2:       # BB#0:
@@ -4045,9 +4045,9 @@
 ;
 ; HASWELL-LABEL: test_pinsrw:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrw:
 ; BTVER2:       # BB#0:
@@ -4102,8 +4102,8 @@
 ; HASWELL-LABEL: test_pmaddwd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaddwd:
 ; BTVER2:       # BB#0:
@@ -4156,8 +4156,8 @@
 ; HASWELL-LABEL: test_pmaxsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxsw:
 ; BTVER2:       # BB#0:
@@ -4209,8 +4209,8 @@
 ; HASWELL-LABEL: test_pmaxub:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxub:
 ; BTVER2:       # BB#0:
@@ -4262,8 +4262,8 @@
 ; HASWELL-LABEL: test_pminsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pminsw:
 ; BTVER2:       # BB#0:
@@ -4315,8 +4315,8 @@
 ; HASWELL-LABEL: test_pminub:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pminub:
 ; BTVER2:       # BB#0:
@@ -4362,7 +4362,7 @@
 ; HASWELL-LABEL: test_pmovmskb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovmskb %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovmskb:
 ; BTVER2:       # BB#0:
@@ -4406,8 +4406,8 @@
 ; HASWELL-LABEL: test_pmulhuw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhuw:
 ; BTVER2:       # BB#0:
@@ -4455,8 +4455,8 @@
 ; HASWELL-LABEL: test_pmulhw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhw:
 ; BTVER2:       # BB#0:
@@ -4504,8 +4504,8 @@
 ; HASWELL-LABEL: test_pmullw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmullw:
 ; BTVER2:       # BB#0:
@@ -4560,8 +4560,8 @@
 ; HASWELL-LABEL: test_pmuludq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmuludq:
 ; BTVER2:       # BB#0:
@@ -4614,9 +4614,9 @@
 ; HASWELL-LABEL: test_por:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_por:
 ; BTVER2:       # BB#0:
@@ -4674,8 +4674,8 @@
 ; HASWELL-LABEL: test_psadbw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psadbw:
 ; BTVER2:       # BB#0:
@@ -4730,9 +4730,9 @@
 ; HASWELL-LABEL: test_pshufd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
-; HASWELL-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pshufd:
 ; BTVER2:       # BB#0:
@@ -4788,9 +4788,9 @@
 ; HASWELL-LABEL: test_pshufhw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
-; HASWELL-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:1.00]
+; HASWELL-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pshufhw:
 ; BTVER2:       # BB#0:
@@ -4846,9 +4846,9 @@
 ; HASWELL-LABEL: test_pshuflw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
-; HASWELL-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:1.00]
+; HASWELL-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pshuflw:
 ; BTVER2:       # BB#0:
@@ -4902,9 +4902,9 @@
 ; HASWELL-LABEL: test_pslld:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pslld:
 ; BTVER2:       # BB#0:
@@ -4958,7 +4958,7 @@
 ; HASWELL-LABEL: test_pslldq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pslldq:
 ; BTVER2:       # BB#0:
@@ -5005,9 +5005,9 @@
 ; HASWELL-LABEL: test_psllq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psllq:
 ; BTVER2:       # BB#0:
@@ -5063,9 +5063,9 @@
 ; HASWELL-LABEL: test_psllw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psllw:
 ; BTVER2:       # BB#0:
@@ -5121,9 +5121,9 @@
 ; HASWELL-LABEL: test_psrad:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrad:
 ; BTVER2:       # BB#0:
@@ -5179,9 +5179,9 @@
 ; HASWELL-LABEL: test_psraw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psraw:
 ; BTVER2:       # BB#0:
@@ -5237,9 +5237,9 @@
 ; HASWELL-LABEL: test_psrld:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrld:
 ; BTVER2:       # BB#0:
@@ -5293,7 +5293,7 @@
 ; HASWELL-LABEL: test_psrldq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrldq:
 ; BTVER2:       # BB#0:
@@ -5340,9 +5340,9 @@
 ; HASWELL-LABEL: test_psrlq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrlq:
 ; BTVER2:       # BB#0:
@@ -5398,9 +5398,9 @@
 ; HASWELL-LABEL: test_psrlw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrlw:
 ; BTVER2:       # BB#0:
@@ -5456,8 +5456,8 @@
 ; HASWELL-LABEL: test_psubb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubb:
 ; BTVER2:       # BB#0:
@@ -5508,8 +5508,8 @@
 ; HASWELL-LABEL: test_psubd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubd:
 ; BTVER2:       # BB#0:
@@ -5556,8 +5556,8 @@
 ; HASWELL-LABEL: test_psubq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubq:
 ; BTVER2:       # BB#0:
@@ -5608,8 +5608,8 @@
 ; HASWELL-LABEL: test_psubsb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubsb:
 ; BTVER2:       # BB#0:
@@ -5661,8 +5661,8 @@
 ; HASWELL-LABEL: test_psubsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubsw:
 ; BTVER2:       # BB#0:
@@ -5714,8 +5714,8 @@
 ; HASWELL-LABEL: test_psubusb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubusb:
 ; BTVER2:       # BB#0:
@@ -5767,8 +5767,8 @@
 ; HASWELL-LABEL: test_psubusw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubusw:
 ; BTVER2:       # BB#0:
@@ -5820,8 +5820,8 @@
 ; HASWELL-LABEL: test_psubw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubw:
 ; BTVER2:       # BB#0:
@@ -5872,8 +5872,8 @@
 ; HASWELL-LABEL: test_punpckhbw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhbw:
 ; BTVER2:       # BB#0:
@@ -5926,9 +5926,9 @@
 ; HASWELL-LABEL: test_punpckhdq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:1.00]
+; HASWELL-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhdq:
 ; BTVER2:       # BB#0:
@@ -5982,9 +5982,9 @@
 ; HASWELL-LABEL: test_punpckhqdq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhqdq:
 ; BTVER2:       # BB#0:
@@ -6038,8 +6038,8 @@
 ; HASWELL-LABEL: test_punpckhwd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhwd:
 ; BTVER2:       # BB#0:
@@ -6090,8 +6090,8 @@
 ; HASWELL-LABEL: test_punpcklbw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklbw:
 ; BTVER2:       # BB#0:
@@ -6144,9 +6144,9 @@
 ; HASWELL-LABEL: test_punpckldq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckldq:
 ; BTVER2:       # BB#0:
@@ -6200,9 +6200,9 @@
 ; HASWELL-LABEL: test_punpcklqdq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklqdq:
 ; BTVER2:       # BB#0:
@@ -6256,8 +6256,8 @@
 ; HASWELL-LABEL: test_punpcklwd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklwd:
 ; BTVER2:       # BB#0:
@@ -6308,9 +6308,9 @@
 ; HASWELL-LABEL: test_pxor:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pxor:
 ; BTVER2:       # BB#0:
@@ -6364,9 +6364,9 @@
 ; HASWELL-LABEL: test_shufpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_shufpd:
 ; BTVER2:       # BB#0:
@@ -6420,10 +6420,10 @@
 ;
 ; HASWELL-LABEL: test_sqrtpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [21:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtpd:
 ; BTVER2:       # BB#0:
@@ -6483,11 +6483,11 @@
 ;
 ; HASWELL-LABEL: test_sqrtsd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT:    vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT:    vmovapd (%rdi), %xmm1 # sched: [1:0.50]
+; HASWELL-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtsd:
 ; BTVER2:       # BB#0:
@@ -6540,8 +6540,8 @@
 ; HASWELL-LABEL: test_subpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subpd:
 ; BTVER2:       # BB#0:
@@ -6588,8 +6588,8 @@
 ; HASWELL-LABEL: test_subsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subsd:
 ; BTVER2:       # BB#0:
@@ -6677,7 +6677,7 @@
 ; HASWELL-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ucomisd:
 ; BTVER2:       # BB#0:
@@ -6746,9 +6746,9 @@
 ; HASWELL-LABEL: test_unpckhpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhpd:
 ; BTVER2:       # BB#0:
@@ -6808,9 +6808,9 @@
 ; HASWELL-LABEL: test_unpcklpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklpd:
 ; BTVER2:       # BB#0:
@@ -6864,9 +6864,9 @@
 ; HASWELL-LABEL: test_xorpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_xorpd:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse3-schedule.ll b/llvm/test/CodeGen/X86/sse3-schedule.ll
index f346d4f..ee15f0e 100644
--- a/llvm/test/CodeGen/X86/sse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse3-schedule.ll
@@ -37,8 +37,8 @@
 ; HASWELL-LABEL: test_addsubpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addsubpd:
 ; BTVER2:       # BB#0:
@@ -86,8 +86,8 @@
 ; HASWELL-LABEL: test_addsubps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addsubps:
 ; BTVER2:       # BB#0:
@@ -135,8 +135,8 @@
 ; HASWELL-LABEL: test_haddpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2:       # BB#0:
@@ -184,8 +184,8 @@
 ; HASWELL-LABEL: test_haddps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2:       # BB#0:
@@ -233,8 +233,8 @@
 ; HASWELL-LABEL: test_hsubpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2:       # BB#0:
@@ -282,8 +282,8 @@
 ; HASWELL-LABEL: test_hsubps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2:       # BB#0:
@@ -328,8 +328,8 @@
 ;
 ; HASWELL-LABEL: test_lddqu:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vlddqu (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vlddqu (%rdi), %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lddqu:
 ; BTVER2:       # BB#0:
@@ -379,7 +379,7 @@
 ; HASWELL-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    movl %esi, %ecx # sched: [1:0.25]
 ; HASWELL-NEXT:    monitor # sched: [100:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_monitor:
 ; BTVER2:       # BB#0:
@@ -432,9 +432,9 @@
 ; HASWELL-LABEL: test_movddup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; HASWELL-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
+; HASWELL-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [1:0.50]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movddup:
 ; BTVER2:       # BB#0:
@@ -489,9 +489,9 @@
 ; HASWELL-LABEL: test_movshdup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; HASWELL-NEXT:    vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; HASWELL-NEXT:    vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movshdup:
 ; BTVER2:       # BB#0:
@@ -546,9 +546,9 @@
 ; HASWELL-LABEL: test_movsldup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; HASWELL-NEXT:    vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT:    vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movsldup:
 ; BTVER2:       # BB#0:
@@ -603,8 +603,8 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; HASWELL-NEXT:    movl %esi, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    mwait # sched: [100:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    mwait # sched: [20:2.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mwait:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse41-schedule.ll b/llvm/test/CodeGen/X86/sse41-schedule.ll
index 31bf4dd..117fae1 100644
--- a/llvm/test/CodeGen/X86/sse41-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse41-schedule.ll
@@ -34,8 +34,8 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendpd:
 ; BTVER2:       # BB#0:
@@ -79,8 +79,8 @@
 ; HASWELL-LABEL: test_blendps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
-; HASWELL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendps:
 ; BTVER2:       # BB#0:
@@ -127,8 +127,8 @@
 ; HASWELL-LABEL: test_blendvpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendvpd:
 ; BTVER2:       # BB#0:
@@ -176,8 +176,8 @@
 ; HASWELL-LABEL: test_blendvps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendvps:
 ; BTVER2:       # BB#0:
@@ -219,8 +219,8 @@
 ; HASWELL-LABEL: test_dppd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_dppd:
 ; BTVER2:       # BB#0:
@@ -262,8 +262,8 @@
 ; HASWELL-LABEL: test_dpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00]
-; HASWELL-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [18:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [14:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_dpps:
 ; BTVER2:       # BB#0:
@@ -305,8 +305,8 @@
 ; HASWELL-LABEL: test_insertps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; HASWELL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_insertps:
 ; BTVER2:       # BB#0:
@@ -344,8 +344,8 @@
 ;
 ; HASWELL-LABEL: test_movntdqa:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntdqa:
 ; BTVER2:       # BB#0:
@@ -382,9 +382,9 @@
 ;
 ; HASWELL-LABEL: test_mpsadbw:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [7:2.00]
+; HASWELL-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mpsadbw:
 ; BTVER2:       # BB#0:
@@ -427,8 +427,8 @@
 ; HASWELL-LABEL: test_packusdw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_packusdw:
 ; BTVER2:       # BB#0:
@@ -477,8 +477,8 @@
 ; HASWELL-LABEL: test_pblendvb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pblendvb:
 ; BTVER2:       # BB#0:
@@ -521,7 +521,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
 ; HASWELL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pblendw:
 ; BTVER2:       # BB#0:
@@ -562,8 +562,8 @@
 ; HASWELL-LABEL: test_pcmpeqq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqq:
 ; BTVER2:       # BB#0:
@@ -605,9 +605,9 @@
 ;
 ; HASWELL-LABEL: test_pextrb:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpextrb $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpextrb $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pextrb:
 ; BTVER2:       # BB#0:
@@ -648,9 +648,9 @@
 ;
 ; HASWELL-LABEL: test_pextrd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpextrd $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpextrd $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pextrd:
 ; BTVER2:       # BB#0:
@@ -690,9 +690,9 @@
 ;
 ; HASWELL-LABEL: test_pextrq:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpextrq $1, %xmm0, %rax # sched: [1:1.00]
-; HASWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpextrq $1, %xmm0, %rax # sched: [2:1.00]
+; HASWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pextrq:
 ; BTVER2:       # BB#0:
@@ -732,9 +732,9 @@
 ;
 ; HASWELL-LABEL: test_pextrw:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpextrw $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpextrw $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pextrw:
 ; BTVER2:       # BB#0:
@@ -775,9 +775,9 @@
 ;
 ; HASWELL-LABEL: test_phminposuw:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT:    vphminposuw (%rdi), %xmm0 # sched: [5:1.00]
 ; HASWELL-NEXT:    vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_phminposuw:
 ; BTVER2:       # BB#0:
@@ -818,9 +818,9 @@
 ;
 ; HASWELL-LABEL: test_pinsrb:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrb:
 ; BTVER2:       # BB#0:
@@ -860,9 +860,9 @@
 ;
 ; HASWELL-LABEL: test_pinsrd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrd:
 ; BTVER2:       # BB#0:
@@ -905,10 +905,10 @@
 ;
 ; HASWELL-LABEL: test_pinsrq:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT:    vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrq:
 ; BTVER2:       # BB#0:
@@ -952,8 +952,8 @@
 ; HASWELL-LABEL: test_pmaxsb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxsb:
 ; BTVER2:       # BB#0:
@@ -995,8 +995,8 @@
 ; HASWELL-LABEL: test_pmaxsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxsd:
 ; BTVER2:       # BB#0:
@@ -1038,8 +1038,8 @@
 ; HASWELL-LABEL: test_pmaxud:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaxud (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxud:
 ; BTVER2:       # BB#0:
@@ -1081,8 +1081,8 @@
 ; HASWELL-LABEL: test_pmaxuw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxuw:
 ; BTVER2:       # BB#0:
@@ -1124,8 +1124,8 @@
 ; HASWELL-LABEL: test_pminsb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpminsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pminsb:
 ; BTVER2:       # BB#0:
@@ -1167,8 +1167,8 @@
 ; HASWELL-LABEL: test_pminsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpminsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pminsd:
 ; BTVER2:       # BB#0:
@@ -1210,8 +1210,8 @@
 ; HASWELL-LABEL: test_pminud:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpminud (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pminud:
 ; BTVER2:       # BB#0:
@@ -1253,8 +1253,8 @@
 ; HASWELL-LABEL: test_pminuw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpminuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pminuw:
 ; BTVER2:       # BB#0:
@@ -1300,9 +1300,9 @@
 ; HASWELL-LABEL: test_pmovsxbw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpmovsxbw (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT:    vpmovsxbw (%rdi), %xmm1 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxbw:
 ; BTVER2:       # BB#0:
@@ -1351,9 +1351,9 @@
 ; HASWELL-LABEL: test_pmovsxbd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpmovsxbd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT:    vpmovsxbd (%rdi), %xmm1 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxbd:
 ; BTVER2:       # BB#0:
@@ -1402,9 +1402,9 @@
 ; HASWELL-LABEL: test_pmovsxbq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpmovsxbq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT:    vpmovsxbq (%rdi), %xmm1 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxbq:
 ; BTVER2:       # BB#0:
@@ -1453,9 +1453,9 @@
 ; HASWELL-LABEL: test_pmovsxdq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpmovsxdq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT:    vpmovsxdq (%rdi), %xmm1 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxdq:
 ; BTVER2:       # BB#0:
@@ -1504,9 +1504,9 @@
 ; HASWELL-LABEL: test_pmovsxwd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpmovsxwd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT:    vpmovsxwd (%rdi), %xmm1 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxwd:
 ; BTVER2:       # BB#0:
@@ -1555,9 +1555,9 @@
 ; HASWELL-LABEL: test_pmovsxwq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpmovsxwq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT:    vpmovsxwq (%rdi), %xmm1 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxwq:
 ; BTVER2:       # BB#0:
@@ -1606,9 +1606,9 @@
 ; HASWELL-LABEL: test_pmovzxbw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
-; HASWELL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
+; HASWELL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxbw:
 ; BTVER2:       # BB#0:
@@ -1657,9 +1657,9 @@
 ; HASWELL-LABEL: test_pmovzxbd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxbd:
 ; BTVER2:       # BB#0:
@@ -1708,9 +1708,9 @@
 ; HASWELL-LABEL: test_pmovzxbq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxbq:
 ; BTVER2:       # BB#0:
@@ -1759,9 +1759,9 @@
 ; HASWELL-LABEL: test_pmovzxdq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
-; HASWELL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:1.00]
+; HASWELL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxdq:
 ; BTVER2:       # BB#0:
@@ -1810,9 +1810,9 @@
 ; HASWELL-LABEL: test_pmovzxwd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
-; HASWELL-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
+; HASWELL-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxwd:
 ; BTVER2:       # BB#0:
@@ -1861,9 +1861,9 @@
 ; HASWELL-LABEL: test_pmovzxwq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxwq:
 ; BTVER2:       # BB#0:
@@ -1908,8 +1908,8 @@
 ; HASWELL-LABEL: test_pmuldq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmuldq:
 ; BTVER2:       # BB#0:
@@ -1953,7 +1953,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00]
 ; HASWELL-NEXT:    vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmulld:
 ; BTVER2:       # BB#0:
@@ -2011,7 +2011,7 @@
 ; HASWELL-NEXT:    setb %cl # sched: [1:0.50]
 ; HASWELL-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; HASWELL-NEXT:    movzbl %cl, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ptest:
 ; BTVER2:       # BB#0:
@@ -2065,10 +2065,10 @@
 ;
 ; HASWELL-LABEL: test_roundpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vroundpd $7, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    vroundpd $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL-NEXT:    vroundpd $7, %xmm0, %xmm0 # sched: [5:1.25]
+; HASWELL-NEXT:    vroundpd $7, (%rdi), %xmm1 # sched: [6:2.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_roundpd:
 ; BTVER2:       # BB#0:
@@ -2116,10 +2116,10 @@
 ;
 ; HASWELL-LABEL: test_roundps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vroundps $7, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    vroundps $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL-NEXT:    vroundps $7, %xmm0, %xmm0 # sched: [5:1.25]
+; HASWELL-NEXT:    vroundps $7, (%rdi), %xmm1 # sched: [6:2.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_roundps:
 ; BTVER2:       # BB#0:
@@ -2168,10 +2168,10 @@
 ;
 ; HASWELL-LABEL: test_roundsd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
-; HASWELL-NEXT:    vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT:    vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [5:1.25]
+; HASWELL-NEXT:    vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_roundsd:
 ; BTVER2:       # BB#0:
@@ -2220,10 +2220,10 @@
 ;
 ; HASWELL-LABEL: test_roundss:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
-; HASWELL-NEXT:    vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT:    vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [5:1.25]
+; HASWELL-NEXT:    vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
 ; HASWELL-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_roundss:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse42-schedule.ll b/llvm/test/CodeGen/X86/sse42-schedule.ll
index d174ac3..d25be4b 100644
--- a/llvm/test/CodeGen/X86/sse42-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse42-schedule.ll
@@ -35,7 +35,7 @@
 ; HASWELL-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
 ; HASWELL-NEXT:    crc32b (%rdx), %edi # sched: [7:1.00]
 ; HASWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: crc32_32_8:
 ; BTVER2:       # BB#0:
@@ -84,7 +84,7 @@
 ; HASWELL-NEXT:    crc32w %si, %edi # sched: [3:1.00]
 ; HASWELL-NEXT:    crc32w (%rdx), %edi # sched: [7:1.00]
 ; HASWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: crc32_32_16:
 ; BTVER2:       # BB#0:
@@ -133,7 +133,7 @@
 ; HASWELL-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
 ; HASWELL-NEXT:    crc32l (%rdx), %edi # sched: [7:1.00]
 ; HASWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: crc32_32_32:
 ; BTVER2:       # BB#0:
@@ -182,7 +182,7 @@
 ; HASWELL-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
 ; HASWELL-NEXT:    crc32b (%rdx), %edi # sched: [7:1.00]
 ; HASWELL-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: crc32_64_8:
 ; BTVER2:       # BB#0:
@@ -231,7 +231,7 @@
 ; HASWELL-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
 ; HASWELL-NEXT:    crc32q (%rdx), %rdi # sched: [7:1.00]
 ; HASWELL-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: crc32_64_64:
 ; BTVER2:       # BB#0:
@@ -297,14 +297,14 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    movl $7, %eax # sched: [1:0.25]
 ; HASWELL-NEXT:    movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT:    vpcmpestri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT:    vpcmpestri $7, %xmm1, %xmm0 # sched: [18:4.00]
 ; HASWELL-NEXT:    movl %ecx, %esi # sched: [1:0.25]
 ; HASWELL-NEXT:    movl $7, %eax # sched: [1:0.25]
 ; HASWELL-NEXT:    movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT:    vpcmpestri $7, (%rdi), %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT:    vpcmpestri $7, (%rdi), %xmm0 # sched: [18:4.00]
 ; HASWELL-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
 ; HASWELL-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpestri:
 ; BTVER2:       # BB#0:
@@ -374,11 +374,11 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    movl $7, %eax # sched: [1:0.25]
 ; HASWELL-NEXT:    movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT:    vpcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00]
+; HASWELL-NEXT:    vpcmpestrm $7, %xmm1, %xmm0 # sched: [19:4.00]
 ; HASWELL-NEXT:    movl $7, %eax # sched: [1:0.25]
 ; HASWELL-NEXT:    movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpestrm:
 ; BTVER2:       # BB#0:
@@ -441,7 +441,7 @@
 ; HASWELL-NEXT:    vpcmpistri $7, (%rdi), %xmm0 # sched: [11:3.00]
 ; HASWELL-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
 ; HASWELL-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpistri:
 ; BTVER2:       # BB#0:
@@ -489,9 +489,9 @@
 ;
 ; HASWELL-LABEL: test_pcmpistrm:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpistrm:
 ; BTVER2:       # BB#0:
@@ -534,7 +534,7 @@
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
 ; HASWELL-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtq:
 ; BTVER2:       # BB#0:
@@ -576,9 +576,9 @@
 ;
 ; HASWELL-LABEL: test_pclmulqdq:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [7:2.00]
-; HASWELL-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [11:2.00]
+; HASWELL-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pclmulqdq:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/ssse3-schedule.ll b/llvm/test/CodeGen/X86/ssse3-schedule.ll
index 86ce4c7..acbc2cf 100644
--- a/llvm/test/CodeGen/X86/ssse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/ssse3-schedule.ll
@@ -42,9 +42,9 @@
 ; HASWELL-LABEL: test_pabsb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpabsb %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpabsb (%rdi), %xmm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pabsb:
 ; BTVER2:       # BB#0:
@@ -100,9 +100,9 @@
 ; HASWELL-LABEL: test_pabsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpabsd %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpabsd (%rdi), %xmm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pabsd:
 ; BTVER2:       # BB#0:
@@ -158,9 +158,9 @@
 ; HASWELL-LABEL: test_pabsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpabsw %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpabsw (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpabsw (%rdi), %xmm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pabsw:
 ; BTVER2:       # BB#0:
@@ -216,8 +216,8 @@
 ; HASWELL-LABEL: test_palignr:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
-; HASWELL-NEXT:    vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_palignr:
 ; BTVER2:       # BB#0:
@@ -264,8 +264,8 @@
 ; HASWELL-LABEL: test_phaddd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_phaddd:
 ; BTVER2:       # BB#0:
@@ -313,8 +313,8 @@
 ; HASWELL-LABEL: test_phaddsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_phaddsw:
 ; BTVER2:       # BB#0:
@@ -362,8 +362,8 @@
 ; HASWELL-LABEL: test_phaddw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_phaddw:
 ; BTVER2:       # BB#0:
@@ -411,8 +411,8 @@
 ; HASWELL-LABEL: test_phsubd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_phsubd:
 ; BTVER2:       # BB#0:
@@ -460,8 +460,8 @@
 ; HASWELL-LABEL: test_phsubsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_phsubsw:
 ; BTVER2:       # BB#0:
@@ -509,8 +509,8 @@
 ; HASWELL-LABEL: test_phsubw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_phsubw:
 ; BTVER2:       # BB#0:
@@ -558,8 +558,8 @@
 ; HASWELL-LABEL: test_pmaddubsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaddubsw:
 ; BTVER2:       # BB#0:
@@ -608,8 +608,8 @@
 ; HASWELL-LABEL: test_pmulhrsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhrsw:
 ; BTVER2:       # BB#0:
@@ -657,8 +657,8 @@
 ; HASWELL-LABEL: test_pshufb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pshufb:
 ; BTVER2:       # BB#0:
@@ -710,8 +710,8 @@
 ; HASWELL-LABEL: test_psignb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsignb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psignb:
 ; BTVER2:       # BB#0:
@@ -763,8 +763,8 @@
 ; HASWELL-LABEL: test_psignd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsignd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psignd:
 ; BTVER2:       # BB#0:
@@ -816,8 +816,8 @@
 ; HASWELL-LABEL: test_psignw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsignw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psignw:
 ; BTVER2:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index b69715c..edbf7df 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -201,14 +201,14 @@
 ; AVX512DQ-NEXT:    vpsraw $2, %ymm0, %ymm5
 ; AVX512DQ-NEXT:    vpaddw %ymm2, %ymm2, %ymm8
 ; AVX512DQ-NEXT:    vpblendvb %ymm8, %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT:    vpsraw $4, %ymm5, %ymm9
+; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm9, %ymm5, %ymm3
 ; AVX512DQ-NEXT:    vpsraw $1, %ymm0, %ymm5
 ; AVX512DQ-NEXT:    vpaddw %ymm8, %ymm8, %ymm9
 ; AVX512DQ-NEXT:    vpblendvb %ymm9, %ymm5, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-NEXT:    vpsraw $4, %ymm4, %ymm5
-; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
 ; AVX512DQ-NEXT:    vpsraw $2, %ymm3, %ymm4
 ; AVX512DQ-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpsraw $1, %ymm3, %ymm4
@@ -328,14 +328,14 @@
 ; AVX512DQ-NEXT:    vpsraw $2, %ymm0, %ymm5
 ; AVX512DQ-NEXT:    vpaddw %ymm2, %ymm2, %ymm8
 ; AVX512DQ-NEXT:    vpblendvb %ymm8, %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT:    vpsraw $4, %ymm5, %ymm9
+; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm9, %ymm5, %ymm3
 ; AVX512DQ-NEXT:    vpsraw $1, %ymm0, %ymm5
 ; AVX512DQ-NEXT:    vpaddw %ymm8, %ymm8, %ymm9
 ; AVX512DQ-NEXT:    vpblendvb %ymm9, %ymm5, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-NEXT:    vpsraw $4, %ymm4, %ymm5
-; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
 ; AVX512DQ-NEXT:    vpsraw $2, %ymm3, %ymm4
 ; AVX512DQ-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpsraw $1, %ymm3, %ymm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 72d1def..9b3511e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -777,9 +777,9 @@
 ;
 ; AVX512DQ-LABEL: splatvar_shift_v32i8:
 ; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $2, %ymm0, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 02681b7..b7d284d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -713,9 +713,9 @@
 ;
 ; AVX512DQ-LABEL: splatvar_shift_v32i8:
 ; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; AVX512DQ-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsllw $2, %ymm0, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 5980d8b..a668fd5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -68,13 +68,13 @@
 ; KNL-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1]
 ; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
 ; KNL-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7]
-; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm1
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; KNL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7]
-; KNL-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
-; KNL-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
-; KNL-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; KNL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
+; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm5
+; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7],ymm0[8],ymm5[9],ymm0[10],ymm5[11],ymm0[12],ymm5[13],ymm0[14],ymm5[15]
 ; KNL-NEXT:    vextracti128 $1, %ymm3, %xmm3
 ; KNL-NEXT:    vpbroadcastw %xmm3, %ymm3
 ; KNL-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]