[X86][SSE] Use (V)PHMINPOSUW for vXi16 SMAX/SMIN/UMAX/UMIN horizontal reductions (PR32841)

(V)PHMINPOSUW determines the UMIN element in an v8i16 input, with suitable bit flipping it can also be used for SMAX/SMIN/UMAX cases as well.

This patch matches vXi16 SMAX/SMIN/UMAX/UMIN horizontal reductions and reduces the input down to a v8i16 vector before calling (V)PHMINPOSUW.

A later patch will use this for v16i8 reductions as well (PR32841).

Differential Revision: https://reviews.llvm.org/D39729

llvm-svn: 318917
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
index 8f5aac4..956a9b1 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -196,52 +196,68 @@
 }
 
 define i16 @test_reduce_v8i16(<8 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v8i16:
-; X86-SSE:       ## BB#0:
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrld $16, %xmm1
-; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
-; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT:    retl
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test_reduce_v8i16:
 ; X86-AVX:       ## BB#0:
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX-NEXT:    retl
 ;
-; X64-SSE-LABEL: test_reduce_v8i16:
-; X64-SSE:       ## BB#0:
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE-NEXT:    psrld $16, %xmm1
-; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT:    retq
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_reduce_v8i16:
 ; X64-AVX:       ## BB#0:
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX-NEXT:    retq
@@ -719,30 +735,39 @@
 }
 
 define i16 @test_reduce_v16i16(<16 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v16i16:
-; X86-SSE:       ## BB#0:
-; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrld $16, %xmm1
-; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
-; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT:    retl
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v16i16:
 ; X86-AVX1:       ## BB#0:
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX1-NEXT:    vzeroupper
@@ -751,42 +776,49 @@
 ; X86-AVX2-LABEL: test_reduce_v16i16:
 ; X86-AVX2:       ## BB#0:
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
-; X64-SSE-LABEL: test_reduce_v16i16:
-; X64-SSE:       ## BB#0:
-; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE-NEXT:    psrld $16, %xmm1
-; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT:    retq
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_reduce_v16i16:
 ; X64-AVX1:       ## BB#0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX1-NEXT:    vzeroupper
@@ -795,13 +827,11 @@
 ; X64-AVX2-LABEL: test_reduce_v16i16:
 ; X64-AVX2:       ## BB#0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX2-NEXT:    vzeroupper
@@ -810,13 +840,11 @@
 ; X64-AVX512-LABEL: test_reduce_v16i16:
 ; X64-AVX512:       ## BB#0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX512-NEXT:    vzeroupper
@@ -1511,21 +1539,34 @@
 }
 
 define i16 @test_reduce_v32i16(<32 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v32i16:
-; X86-SSE:       ## BB#0:
-; X86-SSE-NEXT:    pmaxsw %xmm3, %xmm1
-; X86-SSE-NEXT:    pmaxsw %xmm2, %xmm0
-; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrld $16, %xmm1
-; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
-; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT:    retl
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxsw %xmm3, %xmm1
+; X86-SSE42-NEXT:    pmaxsw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v32i16:
 ; X86-AVX1:       ## BB#0:
@@ -1534,12 +1575,10 @@
 ; X86-AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1549,33 +1588,44 @@
 ; X86-AVX2:       ## BB#0:
 ; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
-; X64-SSE-LABEL: test_reduce_v32i16:
-; X64-SSE:       ## BB#0:
-; X64-SSE-NEXT:    pmaxsw %xmm3, %xmm1
-; X64-SSE-NEXT:    pmaxsw %xmm2, %xmm0
-; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE-NEXT:    psrld $16, %xmm1
-; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT:    retq
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxsw %xmm3, %xmm1
+; X64-SSE42-NEXT:    pmaxsw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_reduce_v32i16:
 ; X64-AVX1:       ## BB#0:
@@ -1584,12 +1634,10 @@
 ; X64-AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1599,13 +1647,11 @@
 ; X64-AVX2:       ## BB#0:
 ; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1614,15 +1660,13 @@
 ; X64-AVX512-LABEL: test_reduce_v32i16:
 ; X64-AVX512:       ## BB#0:
 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX512-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
index 6feb963..cee5195 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -198,52 +198,68 @@
 }
 
 define i16 @test_reduce_v8i16(<8 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v8i16:
-; X86-SSE:       ## BB#0:
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrld $16, %xmm1
-; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
-; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT:    retl
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test_reduce_v8i16:
 ; X86-AVX:       ## BB#0:
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX-NEXT:    retl
 ;
-; X64-SSE-LABEL: test_reduce_v8i16:
-; X64-SSE:       ## BB#0:
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE-NEXT:    psrld $16, %xmm1
-; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT:    retq
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_reduce_v8i16:
 ; X64-AVX:       ## BB#0:
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX-NEXT:    retq
@@ -723,30 +739,39 @@
 }
 
 define i16 @test_reduce_v16i16(<16 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v16i16:
-; X86-SSE:       ## BB#0:
-; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrld $16, %xmm1
-; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
-; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT:    retl
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v16i16:
 ; X86-AVX1:       ## BB#0:
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX1-NEXT:    vzeroupper
@@ -755,42 +780,49 @@
 ; X86-AVX2-LABEL: test_reduce_v16i16:
 ; X86-AVX2:       ## BB#0:
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
-; X64-SSE-LABEL: test_reduce_v16i16:
-; X64-SSE:       ## BB#0:
-; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE-NEXT:    psrld $16, %xmm1
-; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT:    retq
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_reduce_v16i16:
 ; X64-AVX1:       ## BB#0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX1-NEXT:    vzeroupper
@@ -799,13 +831,11 @@
 ; X64-AVX2-LABEL: test_reduce_v16i16:
 ; X64-AVX2:       ## BB#0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX2-NEXT:    vzeroupper
@@ -814,13 +844,11 @@
 ; X64-AVX512-LABEL: test_reduce_v16i16:
 ; X64-AVX512:       ## BB#0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX512-NEXT:    vzeroupper
@@ -1513,21 +1541,34 @@
 }
 
 define i16 @test_reduce_v32i16(<32 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v32i16:
-; X86-SSE:       ## BB#0:
-; X86-SSE-NEXT:    pminsw %xmm3, %xmm1
-; X86-SSE-NEXT:    pminsw %xmm2, %xmm0
-; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrld $16, %xmm1
-; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
-; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT:    retl
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pminsw %xmm3, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm2, %xmm0
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminsw %xmm3, %xmm1
+; X86-SSE42-NEXT:    pminsw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v32i16:
 ; X86-AVX1:       ## BB#0:
@@ -1536,12 +1577,10 @@
 ; X86-AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1551,33 +1590,44 @@
 ; X86-AVX2:       ## BB#0:
 ; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
-; X64-SSE-LABEL: test_reduce_v32i16:
-; X64-SSE:       ## BB#0:
-; X64-SSE-NEXT:    pminsw %xmm3, %xmm1
-; X64-SSE-NEXT:    pminsw %xmm2, %xmm0
-; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE-NEXT:    psrld $16, %xmm1
-; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT:    retq
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pminsw %xmm3, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm2, %xmm0
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminsw %xmm3, %xmm1
+; X64-SSE42-NEXT:    pminsw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_reduce_v32i16:
 ; X64-AVX1:       ## BB#0:
@@ -1586,12 +1636,10 @@
 ; X64-AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1601,13 +1649,11 @@
 ; X64-AVX2:       ## BB#0:
 ; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1616,15 +1662,13 @@
 ; X64-AVX512-LABEL: test_reduce_v32i16:
 ; X64-AVX512:       ## BB#0:
 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX512-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index ee9d895..97235a8 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -259,25 +259,20 @@
 ;
 ; X86-SSE42-LABEL: test_reduce_v8i16:
 ; X86-SSE42:       ## BB#0:
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test_reduce_v8i16:
 ; X86-AVX:       ## BB#0:
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX-NEXT:    retl
@@ -318,25 +313,20 @@
 ;
 ; X64-SSE42-LABEL: test_reduce_v8i16:
 ; X64-SSE42:       ## BB#0:
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_reduce_v8i16:
 ; X64-AVX:       ## BB#0:
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX-NEXT:    retq
@@ -879,14 +869,11 @@
 ; X86-SSE42-LABEL: test_reduce_v16i16:
 ; X86-SSE42:       ## BB#0:
 ; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-SSE42-NEXT:    retl
 ;
@@ -894,12 +881,10 @@
 ; X86-AVX1:       ## BB#0:
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX1-NEXT:    vzeroupper
@@ -908,13 +893,11 @@
 ; X86-AVX2-LABEL: test_reduce_v16i16:
 ; X86-AVX2:       ## BB#0:
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX2-NEXT:    vzeroupper
@@ -965,14 +948,11 @@
 ; X64-SSE42-LABEL: test_reduce_v16i16:
 ; X64-SSE42:       ## BB#0:
 ; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-SSE42-NEXT:    retq
 ;
@@ -980,12 +960,10 @@
 ; X64-AVX1:       ## BB#0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX1-NEXT:    vzeroupper
@@ -994,13 +972,11 @@
 ; X64-AVX2-LABEL: test_reduce_v16i16:
 ; X64-AVX2:       ## BB#0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1009,13 +985,11 @@
 ; X64-AVX512-LABEL: test_reduce_v16i16:
 ; X64-AVX512:       ## BB#0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX512-NEXT:    vzeroupper
@@ -1821,14 +1795,11 @@
 ; X86-SSE42-NEXT:    pmaxuw %xmm3, %xmm1
 ; X86-SSE42-NEXT:    pmaxuw %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-SSE42-NEXT:    retl
 ;
@@ -1839,12 +1810,10 @@
 ; X86-AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1854,13 +1823,11 @@
 ; X86-AVX2:       ## BB#0:
 ; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1929,14 +1896,11 @@
 ; X64-SSE42-NEXT:    pmaxuw %xmm3, %xmm1
 ; X64-SSE42-NEXT:    pmaxuw %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-SSE42-NEXT:    retq
 ;
@@ -1947,12 +1911,10 @@
 ; X64-AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1962,13 +1924,11 @@
 ; X64-AVX2:       ## BB#0:
 ; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1977,15 +1937,13 @@
 ; X64-AVX512-LABEL: test_reduce_v32i16:
 ; X64-AVX512:       ## BB#0:
 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX512-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index 4336967..ba29075 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -261,25 +261,14 @@
 ;
 ; X86-SSE42-LABEL: test_reduce_v8i16:
 ; X86-SSE42:       ## BB#0:
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test_reduce_v8i16:
 ; X86-AVX:       ## BB#0:
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX-NEXT:    retl
@@ -320,25 +309,14 @@
 ;
 ; X64-SSE42-LABEL: test_reduce_v8i16:
 ; X64-SSE42:       ## BB#0:
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_reduce_v8i16:
 ; X64-AVX:       ## BB#0:
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX-NEXT:    retq
@@ -885,14 +863,8 @@
 ; X86-SSE42-LABEL: test_reduce_v16i16:
 ; X86-SSE42:       ## BB#0:
 ; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-SSE42-NEXT:    retl
 ;
@@ -900,12 +872,7 @@
 ; X86-AVX1:       ## BB#0:
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX1-NEXT:    vzeroupper
@@ -914,13 +881,8 @@
 ; X86-AVX2-LABEL: test_reduce_v16i16:
 ; X86-AVX2:       ## BB#0:
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX2-NEXT:    vzeroupper
@@ -971,14 +933,8 @@
 ; X64-SSE42-LABEL: test_reduce_v16i16:
 ; X64-SSE42:       ## BB#0:
 ; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-SSE42-NEXT:    retq
 ;
@@ -986,12 +942,7 @@
 ; X64-AVX1:       ## BB#0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1000,13 +951,8 @@
 ; X64-AVX2-LABEL: test_reduce_v16i16:
 ; X64-AVX2:       ## BB#0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1015,13 +961,8 @@
 ; X64-AVX512-LABEL: test_reduce_v16i16:
 ; X64-AVX512:       ## BB#0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX512-NEXT:    vzeroupper
@@ -1825,14 +1766,8 @@
 ; X86-SSE42-NEXT:    pminuw %xmm3, %xmm1
 ; X86-SSE42-NEXT:    pminuw %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-SSE42-NEXT:    retl
 ;
@@ -1843,12 +1778,7 @@
 ; X86-AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1858,13 +1788,8 @@
 ; X86-AVX2:       ## BB#0:
 ; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1933,14 +1858,8 @@
 ; X64-SSE42-NEXT:    pminuw %xmm3, %xmm1
 ; X64-SSE42-NEXT:    pminuw %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-SSE42-NEXT:    retq
 ;
@@ -1951,12 +1870,7 @@
 ; X64-AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1966,13 +1880,8 @@
 ; X64-AVX2:       ## BB#0:
 ; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1981,15 +1890,10 @@
 ; X64-AVX512-LABEL: test_reduce_v32i16:
 ; X64-AVX512:       ## BB#0:
 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-AVX512-NEXT:    vzeroupper