; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

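; Note: every test below ORs a shl of %a with a lshr of %a using (roughly)
; complementary shift amounts, i.e. a rotate expressed through shifts. The
; AVX1/AVX2 runs lower this via shift sequences, while the XOP runs can often
; match the pattern with the vprotb/vprotw/vprotd/vprotq rotate instructions.
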
;
; Variable Rotates
;
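; The per-element rotate amounts come from the second vector operand, %b.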

define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_rotate_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v4i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v4i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <4 x i64> %a, %b
  %lshr = lshr <4 x i64> %a, %b64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_rotate_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT: vpsrld %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT: vpsrld %xmm7, %xmm6, %xmm7
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT: vpsrld %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT: vpsrld %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX1-NEXT: vpsrld %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v8i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v8i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <8 x i32> %a, %b
  %lshr = lshr <8 x i32> %a, %b32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_rotate_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpsubw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsllw $8, %xmm4, %xmm7
; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm4, %xmm5
; AVX1-NEXT: vpsllw $4, %xmm5, %xmm7
; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpsllw $2, %xmm5, %xmm7
; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpsllw $1, %xmm5, %xmm7
; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm6
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm6
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm7
; AVX1-NEXT: vpblendvb %xmm1, %xmm7, %xmm0, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm7
; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $2, %xmm1, %xmm7
; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm7
; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX1-NEXT: vpsllw $12, %xmm3, %xmm5
; AVX1-NEXT: vpsllw $4, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm5
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm6
; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4
; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4
; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4
; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm4
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm5
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpsllvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
; AVX2-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <16 x i16> %a, %b
  %lshr = lshr <16 x i16> %a, %b16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_rotate_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm9
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpsllw $4, %xmm5, %xmm6
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm5, %xmm6
; AVX1-NEXT: vpsllw $2, %xmm6, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm6
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpsllw $2, %xmm4, %xmm6
; AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm4
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm9, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $5, %xmm8, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpsllw $2, %ymm3, %ymm4
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1
; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <32 x i8> %a, %b
  %lshr = lshr <32 x i8> %a, %b8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Constant Rotates
;
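; The shift amounts are constant build vectors with a different value per element.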

define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_rotate_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $60, %xmm1, %xmm2
; AVX1-NEXT: vpsllq $50, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm3
; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v4i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm2, %xmm3
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm4
; XOPAVX1-NEXT: vpshlq %xmm4, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v4i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 2>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrld $21, %xmm2, %xmm3
; AVX1-NEXT: vpsrld $23, %xmm2, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrld $22, %xmm2, %xmm4
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpsrld $25, %xmm0, %xmm3
; AVX1-NEXT: vpsrld $27, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrld $26, %xmm0, %xmm4
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v8i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm3
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v8i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @constant_rotate_v8i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,28784,24672,20560,16448,12336,8224,4112]
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,57568,49344,41120,32896,24672,16448,8224]
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,49600,33152,16704,256,49344,32896,16448]
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,33664,768,33408,512,33152,256,32896]
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,61680,57568,53456,49344,45232,41120,37008]
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,57824,49600,41376,33152,24928,16704,8480]
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,50112,33664,17216,768,49856,33408,16960]
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2048,34688,1792,34432,1536,34176,1280,33920]
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v8i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm2, %xmm3
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm4
; XOPAVX1-NEXT: vpshlw %xmm4, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v8i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm3
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_rotate_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $2, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm7
; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm5
; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
; AVX1-NEXT: vpand %xmm8, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm0, %xmm4
; AVX1-NEXT: vpsllw $2, %xmm4, %xmm5
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm9
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm7
; AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm9, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm3
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm3
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm3
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Uniform Constant Rotates
;
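; The shift amounts are splat constants, i.e. the same rotate amount in every element.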

define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllq $14, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $14, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpslld $4, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $28, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpslld $4, %ymm0, %ymm1
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $9, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm1
; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Masked Uniform Constant Rotates
;
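; As above, but both shifted halves are ANDed with constant masks before the final OR.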

define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $15, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllq $15, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $49, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $15, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vprotq $15, %xmm1, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 255, i64 255, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 33, i64 33, i64 33>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}

define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpslld $4, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $28, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpslld $4, %ymm0, %ymm1
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  %lmask = and <8 x i32> %shl, <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $11, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}