; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -O2 | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
; Broadcast mask bit 4 of a loaded <8 x i1> to a <2 x i1> select mask for <2 x double>.
define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrw $4, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
; Broadcast mask bit 7 of a loaded <8 x i1> to a <2 x i1> select mask for <2 x double>.
define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrw $6, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $6, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
; Broadcast mask bit 8 of a loaded <16 x i1> to a <2 x i1> select mask for <2 x double>.
define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
; Broadcast mask bit 8 of a loaded <16 x i1> to a <4 x i1> select mask for <4 x float>.
define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
; Broadcast mask bit 15 of a loaded <16 x i1> to a <2 x i1> select mask for <2 x double>.
define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $14, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $14, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
; Broadcast mask bit 15 of a loaded <16 x i1> to a <4 x i1> select mask for <4 x float>.
define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $12, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $12, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
; Broadcast mask bit 16 of a loaded <32 x i1> to a <2 x i1> select mask for <2 x double>.
define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
; Broadcast mask bit 16 of a loaded <32 x i1> to a <4 x i1> select mask for <4 x float>.
define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
; Broadcast mask bit 16 of a loaded <32 x i1> to an <8 x i1> select mask for <8 x float>.
define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %zmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %zmm2
; AVX512-NEXT:    vpmovq2m %zmm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %zmm2
; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm2, %zmm2
; AVX512NOTDQ-NEXT:    vptestmq %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
; Broadcast mask bit 31 of a loaded <32 x i1> to a <2 x i1> select mask for <2 x double>.
define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $30, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $30, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
; Broadcast mask bit 31 of a loaded <32 x i1> to a <4 x i1> select mask for <4 x float>.
define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $28, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $28, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
; Broadcast mask bit 31 of a loaded <32 x i1> to an <8 x i1> select mask for <8 x float>.
define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $24, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %zmm2
; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT:    vpermq %zmm2, %zmm3, %zmm2
; AVX512-NEXT:    vpmovq2m %zmm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $24, %k0, %k1
; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT:    vpermq %zmm2, %zmm3, %zmm2
; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm2, %zmm2
; AVX512NOTDQ-NEXT:    vptestmq %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
; Broadcast mask bit 32 of a loaded <64 x i1> to a <2 x i1> select mask for <2 x double>.
define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
; Broadcast mask bit 32 of a loaded <64 x i1> to a <4 x i1> select mask for <4 x float>.
define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
; Broadcast mask bit 32 of a loaded <64 x i1> to an <8 x i1> select mask for <8 x float>.
define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %zmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %zmm2
; AVX512-NEXT:    vpmovq2m %zmm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %zmm2
; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm2, %zmm2
; AVX512NOTDQ-NEXT:    vptestmq %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
; Broadcast mask bit 32 of a loaded <64 x i1> to a <16 x i1> select mask for <16 x float>.
define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %zmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %zmm2
; AVX512-NEXT:    vpmovd2m %zmm2, %k1
; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %zmm2
; AVX512NOTDQ-NEXT:    vpslld $31, %zmm2, %zmm2
; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, <16 x float>* %a3
    ret void
}
; Broadcast mask bit 63 of a loaded <64 x i1> to a <2 x i1> select mask for <2 x double>.
define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $62, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
; Broadcast mask bit 63 of a loaded <64 x i1> to a <4 x i1> select mask for <4 x float>.
define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $60, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
; Broadcast mask bit 63 of a loaded <64 x i1> to an <8 x i1> select mask for <8 x float>.
define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $56, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %zmm2
; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT:    vpermq %zmm2, %zmm3, %zmm2
; AVX512-NEXT:    vpmovq2m %zmm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $56, %k0, %k1
; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT:    vpermq %zmm2, %zmm3, %zmm2
; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm2, %zmm2
; AVX512NOTDQ-NEXT:    vptestmq %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
; Broadcast mask bit 63 of a loaded <64 x i1> to a <16 x i1> select mask for <16 x float>.
define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $48, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %zmm2
; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpermd %zmm2, %zmm3, %zmm2
; AVX512-NEXT:    vpmovd2m %zmm2, %k1
; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $48, %k0, %k1
; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-NEXT:    vpermd %zmm2, %zmm3, %zmm2
; AVX512NOTDQ-NEXT:    vpslld $31, %zmm2, %zmm2
; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, <16 x float>* %a3
    ret void
}
; Extract bit 1 of a loaded <2 x i1> and store it as a <1 x i1>.
define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrw $1, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <2 x i1>, <2 x i1>* %a0
    %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Extract bit 1 of a loaded <3 x i1> and store it as a <1 x i1>.
define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrw $1, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Extract bit 2 of a loaded <3 x i1> and store it as a <1 x i1>.
define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrw $2, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Extract bit 2 of a loaded <4 x i1> and store it as a <1 x i1>.
define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrw $2, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <4 x i1>, <4 x i1>* %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Extract bit 3 of a loaded <4 x i1> and store it as a <1 x i1>.
define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrw $3, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $3, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <4 x i1>, <4 x i1>* %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Extract bit 4 of a loaded <8 x i1> and store it as a <1 x i1>.
define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512:       # BB#0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrw $4, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512NOTDQ:       # BB#0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; Broadcast bit 4 of a loaded <8 x i1> mask to both lanes of a <2 x i1> store.
; The mask is widened to vector elements (vpmovm2q / masked move), broadcast,
; then narrowed back to a mask register for the store.
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrw $4, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k0
; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <8 x i1>, <8 x i1>* %a0
 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
 store <2 x i1> %d1, <2 x i1>* %a1
 ret void
}
define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; Extract the top bit (bit 7) of a loaded <8 x i1> mask and store it as <1 x i1>.
; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrw $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k0
; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <8 x i1>, <8 x i1>* %a0
 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
 store <1 x i1> %d1, <1 x i1>* %a1
 ret void
}
define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; Broadcast bit 7 of a loaded <8 x i1> mask to both lanes of a <2 x i1> store.
; Note the shift is by 6, then vpshufd duplicates the high lane (element 1).
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrw $6, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k0
; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <8 x i1>, <8 x i1>* %a0
 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
 store <2 x i1> %d1, <2 x i1>* %a1
 ret void
}
define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; Extract bit 8 of a loaded <16 x i1> mask and store it as a <1 x i1>.
; The 16-bit mask loads directly with kmovw on both targets.
; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <16 x i1>, <16 x i1>* %a0
 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
 store <1 x i1> %d1, <1 x i1>* %a1
 ret void
}
define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; Broadcast bit 8 of a loaded <16 x i1> mask to both lanes of a <2 x i1> store.
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <16 x i1>, <16 x i1>* %a0
 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
 store <2 x i1> %d1, <2 x i1>* %a1
 ret void
}
define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; Broadcast bit 8 of a loaded <16 x i1> mask to all four lanes of a <4 x i1> store.
; The 4-lane case widens through dword elements (d variants) instead of qwords.
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <16 x i1>, <16 x i1>* %a0
 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
 store <4 x i1> %d1, <4 x i1>* %a1
 ret void
}
define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; Extract the top bit (bit 15) of a loaded <16 x i1> mask and store as <1 x i1>.
; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $15, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <16 x i1>, <16 x i1>* %a0
 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
 store <1 x i1> %d1, <1 x i1>* %a1
 ret void
}
define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; Broadcast bit 15 of a loaded <16 x i1> mask to both lanes of a <2 x i1> store.
; Shift by 14 then duplicate the high qword lane via vpshufd.
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $14, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <16 x i1>, <16 x i1>* %a0
 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
 store <2 x i1> %d1, <2 x i1>* %a1
 ret void
}
define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; Broadcast bit 15 of a loaded <16 x i1> mask to all lanes of a <4 x i1> store.
; Shift by 12 then splat dword element 3 via vpshufd.
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <16 x i1>, <16 x i1>* %a0
 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
 store <4 x i1> %d1, <4 x i1>* %a1
 ret void
}
define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; Extract bit 16 of a loaded <32 x i1> mask and store it as a <1 x i1>.
; The 32-bit mask uses kmovd / kshiftrd (BW forms).
; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
 store <1 x i1> %d1, <1 x i1>* %a1
 ret void
}
define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; Broadcast bit 16 of a loaded <32 x i1> mask to both lanes of a <2 x i1> store.
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
 store <2 x i1> %d1, <2 x i1>* %a1
 ret void
}
define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; Broadcast bit 16 of a loaded <32 x i1> mask to all lanes of a <4 x i1> store.
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
 store <4 x i1> %d1, <4 x i1>* %a1
 ret void
}
define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; Broadcast bit 16 of a loaded <32 x i1> mask to all lanes of an <8 x i1> store.
; The 8-lane case widens into a zmm register (8 x i64 lanes), so vzeroupper is
; emitted before returning.
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %zmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512-NEXT: vpmovq2m %zmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
 store <8 x i1> %d1, <8 x i1>* %a1
 ret void
}
define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; Extract the top bit (bit 31) of a loaded <32 x i1> mask and store as <1 x i1>.
; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $31, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
 store <1 x i1> %d1, <1 x i1>* %a1
 ret void
}
define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; Broadcast bit 31 of a loaded <32 x i1> mask to both lanes of a <2 x i1> store.
; Shift by 30 then duplicate the high qword lane via vpshufd.
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $30, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
 store <2 x i1> %d1, <2 x i1>* %a1
 ret void
}
define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; Broadcast bit 31 of a loaded <32 x i1> mask to all lanes of a <4 x i1> store.
; Shift by 28 then splat dword element 3 via vpshufd.
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $28, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
 store <4 x i1> %d1, <4 x i1>* %a1
 ret void
}
define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; Broadcast bit 31 of a loaded <32 x i1> mask to all lanes of an <8 x i1> store.
; Shift by 24, then vpermq with an all-7 index vector splats qword lane 7.
; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $24, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %zmm0
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vpmovq2m %zmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
 store <8 x i1> %d1, <8 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; Extract bit 32 of a loaded <64 x i1> mask and store it as a <1 x i1>.
; The 64-bit mask uses kmovq / kshiftrq (BW forms).
; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
 store <1 x i1> %d1, <1 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; Broadcast bit 32 of a loaded <64 x i1> mask to both lanes of a <2 x i1> store.
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
 store <2 x i1> %d1, <2 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; Broadcast bit 32 of a loaded <64 x i1> mask to all lanes of a <4 x i1> store.
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
 store <4 x i1> %d1, <4 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; Broadcast bit 32 of a loaded <64 x i1> mask to all lanes of an <8 x i1> store.
; Widens through zmm (8 x i64 lanes); vzeroupper precedes the return.
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %zmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512-NEXT: vpmovq2m %zmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
 store <8 x i1> %d1, <8 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; Broadcast bit 32 of a loaded <64 x i1> mask to all lanes of a <16 x i1> store.
; 16-lane result widens through zmm dwords; the mask is stored with kmovw.
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %zmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512-NEXT: vpmovd2m %zmm0, %k0
; AVX512-NEXT: kmovw %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
 store <16 x i1> %d1, <16 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; Extract the top bit (bit 63) of a loaded <64 x i1> mask and store as <1 x i1>.
; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $63, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $63, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
 store <1 x i1> %d1, <1 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; Broadcast bit 63 of a loaded <64 x i1> mask to both lanes of a <2 x i1> store.
; Shift by 62 then duplicate the high qword lane via vpshufd.
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $62, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
 store <2 x i1> %d1, <2 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; Broadcast bit 63 of a loaded <64 x i1> mask to all lanes of a <4 x i1> store.
; Shift by 60 then splat dword element 3 via vpshufd.
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $60, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
 store <4 x i1> %d1, <4 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; Broadcast bit 63 of a loaded <64 x i1> mask to all lanes of an <8 x i1> store.
; Shift by 56, then vpermq with an all-7 index vector splats qword lane 7.
; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $56, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %zmm0
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vpmovq2m %zmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
 store <8 x i1> %d1, <8 x i1>* %a1
 ret void
}
define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; Broadcast bit 63 of a loaded <64 x i1> mask to all lanes of a <16 x i1> store.
; Shift by 48, then vpermd with an all-15 index vector splats dword lane 15;
; the 16-bit result mask is stored with kmovw.
; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512: # BB#0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $48, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %zmm0
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vpmovd2m %zmm0, %k0
; AVX512-NEXT: kmovw %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ: # BB#0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $48, %k0, %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
 store <16 x i1> %d1, <16 x i1>* %a1
 ret void
}
1458