[X86][XOP] Add support for lowering vector rotations
This patch adds support for lowering to the XOP VPROT / VPROTI vector bit rotation instructions.
This required changes to the DAGCombiner rotation pattern matching to support vector types - so far I've only enabled it for splat vectors, but generalising this further is feasible in the future.
Differential Revision: http://reviews.llvm.org/D13851
llvm-svn: 251188
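
For illustration, this is the kind of splat-vector rotate pattern the combiner can now recognise (a minimal sketch modelled on the splatconstant tests in the diff below; the function name is hypothetical). On an XOP target the shl/lshr/or sequence should now fold to VPROT with an immediate rather than separate shifts plus an OR:

  define <8 x i32> @rot_v8i32_splat4(<8 x i32> %a) {
    ; rotate each 32-bit lane left by 4: (a << 4) | (a >> 28)
    %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
    %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
    %or = or <8 x i32> %shl, %lshr
    ret <8 x i32> %or
  }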
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 844e748..56d2354 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -47,30 +47,20 @@
;
; XOPAVX1-LABEL: var_rotate_v4i64:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
-; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v4i64:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
%shl = shl <4 x i64> %a, %b
@@ -135,30 +125,20 @@
;
; XOPAVX1-LABEL: var_rotate_v8i32:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
-; XOPAVX1-NEXT: vpshld %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v8i32:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
%shl = shl <8 x i32> %a, %b
@@ -262,40 +242,20 @@
;
; XOPAVX1-LABEL: var_rotate_v16i16:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpsubw %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpshlw %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlw %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm3
-; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v16i16:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; XOPAVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT: vpsubw %xmm3, %xmm5, %xmm3
-; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm2
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
%shl = shl <16 x i16> %a, %b
@@ -400,40 +360,20 @@
;
; XOPAVX1-LABEL: var_rotate_v32i8:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT: vpsubb %xmm3, %xmm4, %xmm3
-; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v32i8:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT: vpsubb %xmm3, %xmm5, %xmm3
-; XOPAVX2-NEXT: vpshlb %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
%shl = shl <32 x i8> %a, %b
@@ -789,21 +729,18 @@
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpsllq $14, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrlq $50, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrlq $50, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllq $14, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
%lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
@@ -833,21 +770,18 @@
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpslld $4, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrld $28, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrld $28, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
@@ -877,21 +811,18 @@
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpsllw $7, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpsllw $7, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrlw $9, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrlw $9, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllw $7, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlw $9, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
@@ -929,26 +860,18 @@
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm3
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; XOPAVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
@@ -989,28 +912,22 @@
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpsllq $15, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpsllq $15, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrlq $49, %xmm2, %xmm2
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vprotq $15, %xmm1, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllq $15, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
+; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
%lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
@@ -1048,27 +965,21 @@
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpslld $4, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrld $28, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrld $28, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
@@ -1104,25 +1015,20 @@
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpsllw $5, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpsllw $5, %xmm2, %xmm3
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT: vpsrlw $11, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrlw $11, %xmm2, %xmm2
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllw $5, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpsrlw $11, %ymm0, %ymm0
+; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
%lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
@@ -1166,30 +1072,20 @@
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm3
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm1
-; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm1
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>