; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW

define void @store_v1i32_v1i32(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) {
; SSE-LABEL: store_v1i32_v1i32:
; SSE: ## %bb.0:
; SSE-NEXT: testl %edi, %edi
; SSE-NEXT: jne LBB0_2
; SSE-NEXT: ## %bb.1: ## %cond.store
; SSE-NEXT: movl %edx, (%rsi)
; SSE-NEXT: LBB0_2: ## %else
; SSE-NEXT: retq
;
; AVX-LABEL: store_v1i32_v1i32:
; AVX: ## %bb.0:
; AVX-NEXT: testl %edi, %edi
; AVX-NEXT: jne LBB0_2
; AVX-NEXT: ## %bb.1: ## %cond.store
; AVX-NEXT: movl %edx, (%rsi)
; AVX-NEXT: LBB0_2: ## %else
; AVX-NEXT: retq
  %mask = icmp eq <1 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask)
  ret void
}

define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; SSE2-LABEL: store_v4i32_v4i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movd %xmm1, (%rdi)
; SSE2-NEXT: LBB1_2: ## %else
; SSE2-NEXT: pextrw $2, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE2-NEXT: movd %xmm2, 4(%rdi)
; SSE2-NEXT: LBB1_4: ## %else2
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_6
; SSE2-NEXT: ## %bb.5: ## %cond.store3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE2-NEXT: movd %xmm2, 8(%rdi)
; SSE2-NEXT: LBB1_6: ## %else4
; SSE2-NEXT: pextrw $6, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_8
; SSE2-NEXT: ## %bb.7: ## %cond.store5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE2-NEXT: movd %xmm0, 12(%rdi)
; SSE2-NEXT: LBB1_8: ## %else6
; SSE2-NEXT: retq
;
; SSE4-LABEL: store_v4i32_v4i32:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: pcmpeqd %xmm0, %xmm2
; SSE4-NEXT: pextrb $0, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB1_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
; SSE4-NEXT: movss %xmm1, (%rdi)
; SSE4-NEXT: LBB1_2: ## %else
; SSE4-NEXT: pextrb $4, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB1_4
; SSE4-NEXT: ## %bb.3: ## %cond.store1
; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi)
; SSE4-NEXT: LBB1_4: ## %else2
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: pcmpeqd %xmm2, %xmm0
; SSE4-NEXT: pextrb $8, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB1_6
; SSE4-NEXT: ## %bb.5: ## %cond.store3
; SSE4-NEXT: extractps $2, %xmm1, 8(%rdi)
; SSE4-NEXT: LBB1_6: ## %else4
; SSE4-NEXT: pextrb $12, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB1_8
; SSE4-NEXT: ## %bb.7: ## %cond.store5
; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi)
; SSE4-NEXT: LBB1_8: ## %else6
; SSE4-NEXT: retq
;
; AVX1-LABEL: store_v4i32_v4i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_v4i32_v4i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_v4i32_v4i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: store_v4i32_v4i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
  ret void
}

define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
; SSE2-LABEL: store_v8i32_v8i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: packssdw %xmm0, %xmm5
; SSE2-NEXT: movd %xmm5, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movd %xmm2, (%rdi)
; SSE2-NEXT: LBB2_2: ## %else
; SSE2-NEXT: psrlq $16, %xmm4
; SSE2-NEXT: movd %xmm4, %eax
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,2,3]
; SSE2-NEXT: movd %xmm4, 4(%rdi)
; SSE2-NEXT: LBB2_4: ## %else2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_6
; SSE2-NEXT: ## %bb.5: ## %cond.store3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE2-NEXT: movd %xmm4, 8(%rdi)
; SSE2-NEXT: LBB2_6: ## %else4
; SSE2-NEXT: pextrw $6, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_8
; SSE2-NEXT: ## %bb.7: ## %cond.store5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE2-NEXT: movd %xmm0, 12(%rdi)
; SSE2-NEXT: LBB2_8: ## %else6
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_10
; SSE2-NEXT: ## %bb.9: ## %cond.store7
; SSE2-NEXT: movd %xmm3, 16(%rdi)
; SSE2-NEXT: LBB2_10: ## %else8
; SSE2-NEXT: pextrw $2, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_12
; SSE2-NEXT: ## %bb.11: ## %cond.store9
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE2-NEXT: movd %xmm0, 20(%rdi)
; SSE2-NEXT: LBB2_12: ## %else10
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pextrw $4, %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_14
; SSE2-NEXT: ## %bb.13: ## %cond.store11
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE2-NEXT: movd %xmm0, 24(%rdi)
; SSE2-NEXT: LBB2_14: ## %else12
; SSE2-NEXT: pextrw $6, %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_16
; SSE2-NEXT: ## %bb.15: ## %cond.store13
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE2-NEXT: movd %xmm0, 28(%rdi)
; SSE2-NEXT: LBB2_16: ## %else14
; SSE2-NEXT: retq
;
; SSE4-LABEL: store_v8i32_v8i32:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm4, %xmm4
; SSE4-NEXT: pcmpeqd %xmm0, %xmm4
; SSE4-NEXT: pextrb $0, %xmm4, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
; SSE4-NEXT: movss %xmm2, (%rdi)
; SSE4-NEXT: LBB2_2: ## %else
; SSE4-NEXT: pextrb $4, %xmm4, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_4
; SSE4-NEXT: ## %bb.3: ## %cond.store1
; SSE4-NEXT: extractps $1, %xmm2, 4(%rdi)
; SSE4-NEXT: LBB2_4: ## %else2
; SSE4-NEXT: pxor %xmm4, %xmm4
; SSE4-NEXT: pcmpeqd %xmm4, %xmm0
; SSE4-NEXT: pextrb $8, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_6
; SSE4-NEXT: ## %bb.5: ## %cond.store3
; SSE4-NEXT: extractps $2, %xmm2, 8(%rdi)
; SSE4-NEXT: LBB2_6: ## %else4
; SSE4-NEXT: pextrb $12, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_8
; SSE4-NEXT: ## %bb.7: ## %cond.store5
; SSE4-NEXT: extractps $3, %xmm2, 12(%rdi)
; SSE4-NEXT: LBB2_8: ## %else6
; SSE4-NEXT: pxor %xmm0, %xmm0
; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
; SSE4-NEXT: pextrb $0, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_10
; SSE4-NEXT: ## %bb.9: ## %cond.store7
; SSE4-NEXT: movss %xmm3, 16(%rdi)
; SSE4-NEXT: LBB2_10: ## %else8
; SSE4-NEXT: pextrb $4, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_12
; SSE4-NEXT: ## %bb.11: ## %cond.store9
; SSE4-NEXT: extractps $1, %xmm3, 20(%rdi)
; SSE4-NEXT: LBB2_12: ## %else10
; SSE4-NEXT: pxor %xmm0, %xmm0
; SSE4-NEXT: pcmpeqd %xmm0, %xmm1
; SSE4-NEXT: pextrb $8, %xmm1, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_14
; SSE4-NEXT: ## %bb.13: ## %cond.store11
; SSE4-NEXT: extractps $2, %xmm3, 24(%rdi)
; SSE4-NEXT: LBB2_14: ## %else12
; SSE4-NEXT: pextrb $12, %xmm1, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_16
; SSE4-NEXT: ## %bb.15: ## %cond.store13
; SSE4-NEXT: extractps $3, %xmm3, 28(%rdi)
; SSE4-NEXT: LBB2_16: ## %else14
; SSE4-NEXT: retq
;
; AVX1-LABEL: store_v8i32_v8i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_v8i32_v8i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_v8i32_v8i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: store_v8i32_v8i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VLBW-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
  ret void
}

define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; SSE2-LABEL: store_v2f32_v2i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB3_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movss %xmm1, (%rdi)
; SSE2-NEXT: LBB3_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB3_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE2-NEXT: movss %xmm1, 4(%rdi)
; SSE2-NEXT: LBB3_4: ## %else2
; SSE2-NEXT: retq
;
; SSE4-LABEL: store_v2f32_v2i32:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: movdqa %xmm0, %xmm3
; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; SSE4-NEXT: pcmpeqq %xmm2, %xmm3
; SSE4-NEXT: pextrb $0, %xmm3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB3_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
; SSE4-NEXT: movss %xmm1, (%rdi)
; SSE4-NEXT: LBB3_2: ## %else
; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
; SSE4-NEXT: pextrb $8, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB3_4
; SSE4-NEXT: ## %bb.3: ## %cond.store1
; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi)
; SSE4-NEXT: LBB3_4: ## %else2
; SSE4-NEXT: retq
;
; AVX1-LABEL: store_v2f32_v2i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_v2f32_v2i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_v2f32_v2i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: store_v2f32_v2i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vmovups %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; SSE2-LABEL: store_v2i32_v2i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB4_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movd %xmm1, (%rdi)
; SSE2-NEXT: LBB4_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB4_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: movd %xmm0, 4(%rdi)
; SSE2-NEXT: LBB4_4: ## %else2
; SSE2-NEXT: retq
;
; SSE4-LABEL: store_v2i32_v2i32:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: movdqa %xmm0, %xmm3
; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; SSE4-NEXT: pcmpeqq %xmm2, %xmm3
; SSE4-NEXT: pextrb $0, %xmm3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB4_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
; SSE4-NEXT: movss %xmm1, (%rdi)
; SSE4-NEXT: LBB4_2: ## %else
; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
; SSE4-NEXT: pextrb $8, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB4_4
; SSE4-NEXT: ## %bb.3: ## %cond.store1
; SSE4-NEXT: extractps $2, %xmm1, 4(%rdi)
; SSE4-NEXT: LBB4_4: ## %else2
; SSE4-NEXT: retq
;
; AVX1-LABEL: store_v2i32_v2i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_v2i32_v2i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_v2i32_v2i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: store_v2i32_v2i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

define void @const_store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; SSE-LABEL: const_store_v4i32_v4i32:
; SSE: ## %bb.0:
; SSE-NEXT: movups %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: const_store_v4i32_v4i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: const_store_v4i32_v4i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: const_store_v4i32_v4i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: movw $15, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: const_store_v4i32_v4i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: kxnorw %k0, %k0, %k1
; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; When only one element of the mask is set, reduce to a scalar store.

define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; SSE-LABEL: one_mask_bit_set1:
; SSE: ## %bb.0:
; SSE-NEXT: movss %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: one_mask_bit_set1:
; AVX: ## %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
  ret void
}

; Choose a different element to show that the correct address offset is produced.

define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; SSE2-LABEL: one_mask_bit_set2:
; SSE2: ## %bb.0:
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: movss %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: one_mask_bit_set2:
; SSE4: ## %bb.0:
; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: one_mask_bit_set2:
; AVX: ## %bb.0:
; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX-NEXT: retq
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
  ret void
}

; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.

define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; SSE-LABEL: one_mask_bit_set3:
; SSE: ## %bb.0:
; SSE-NEXT: movlps %xmm1, 16(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: one_mask_bit_set3:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
  ret void
}

; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.

define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; SSE-LABEL: one_mask_bit_set4:
; SSE: ## %bb.0:
; SSE-NEXT: movhpd %xmm1, 24(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: one_mask_bit_set4:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovhpd %xmm0, 24(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
  ret void
}

; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.

define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; SSE-LABEL: one_mask_bit_set5:
; SSE: ## %bb.0:
; SSE-NEXT: movlps %xmm3, 48(%rdi)
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: one_mask_bit_set5:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1OR2-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set5:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
  ret void
}

; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.

define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
; SSE2-LABEL: trunc_mask:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB11_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movss %xmm0, (%rdi)
; SSE2-NEXT: LBB11_2: ## %else
; SSE2-NEXT: pextrw $2, %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB11_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: movss %xmm1, 4(%rdi)
; SSE2-NEXT: LBB11_4: ## %else2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pextrw $4, %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB11_6
; SSE2-NEXT: ## %bb.5: ## %cond.store3
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT: movss %xmm2, 8(%rdi)
; SSE2-NEXT: LBB11_6: ## %else4
; SSE2-NEXT: pextrw $6, %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB11_8
; SSE2-NEXT: ## %bb.7: ## %cond.store5
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: movss %xmm0, 12(%rdi)
; SSE2-NEXT: LBB11_8: ## %else6
; SSE2-NEXT: retq
;
; SSE4-LABEL: trunc_mask:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm1, %xmm1
; SSE4-NEXT: pcmpgtd %xmm2, %xmm1
; SSE4-NEXT: pextrb $0, %xmm1, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB11_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
; SSE4-NEXT: movss %xmm0, (%rdi)
; SSE4-NEXT: LBB11_2: ## %else
; SSE4-NEXT: pextrb $4, %xmm1, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB11_4
; SSE4-NEXT: ## %bb.3: ## %cond.store1
; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi)
; SSE4-NEXT: LBB11_4: ## %else2
; SSE4-NEXT: pxor %xmm1, %xmm1
; SSE4-NEXT: pcmpgtd %xmm2, %xmm1
; SSE4-NEXT: pextrb $8, %xmm1, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB11_6
; SSE4-NEXT: ## %bb.5: ## %cond.store3
; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi)
; SSE4-NEXT: LBB11_6: ## %else4
; SSE4-NEXT: pextrb $12, %xmm1, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB11_8
; SSE4-NEXT: ## %bb.7: ## %cond.store5
; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi)
; SSE4-NEXT: LBB11_8: ## %else6
; SSE4-NEXT: retq
;
; AVX1OR2-LABEL: trunc_mask:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vmaskmovps %xmm0, %xmm2, (%rdi)
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: trunc_mask:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: trunc_mask:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpcmpgtd %xmm2, %xmm1, %k1
; AVX512VLBW-NEXT: vmovups %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
  %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask)
  ret void
}

; SimplifyDemandedBits eliminates an ashr here.

define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x double>* %p, <4 x i32> %masksrc) {
; SSE2-LABEL: masked_store_bool_mask_demand_trunc_sext:
; SSE2: ## %bb.0:
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB12_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movlpd %xmm0, (%rdi)
; SSE2-NEXT: LBB12_2: ## %else
; SSE2-NEXT: pextrw $2, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB12_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
; SSE2-NEXT: movhpd %xmm0, 8(%rdi)
; SSE2-NEXT: LBB12_4: ## %else2
; SSE2-NEXT: pextrw $4, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB12_6
; SSE2-NEXT: ## %bb.5: ## %cond.store3
; SSE2-NEXT: movlpd %xmm1, 16(%rdi)
; SSE2-NEXT: LBB12_6: ## %else4
; SSE2-NEXT: pextrw $6, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB12_8
; SSE2-NEXT: ## %bb.7: ## %cond.store5
; SSE2-NEXT: movhpd %xmm1, 24(%rdi)
; SSE2-NEXT: LBB12_8: ## %else6
; SSE2-NEXT: retq
;
; SSE4-LABEL: masked_store_bool_mask_demand_trunc_sext:
; SSE4: ## %bb.0:
; SSE4-NEXT: pextrb $0, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB12_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
; SSE4-NEXT: movlpd %xmm0, (%rdi)
; SSE4-NEXT: LBB12_2: ## %else
; SSE4-NEXT: pextrb $4, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB12_4
; SSE4-NEXT: ## %bb.3: ## %cond.store1
; SSE4-NEXT: movhpd %xmm0, 8(%rdi)
; SSE4-NEXT: LBB12_4: ## %else2
; SSE4-NEXT: pextrb $8, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB12_6
; SSE4-NEXT: ## %bb.5: ## %cond.store3
; SSE4-NEXT: movlpd %xmm1, 16(%rdi)
; SSE4-NEXT: LBB12_6: ## %else4
; SSE4-NEXT: pextrb $12, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB12_8
; SSE4-NEXT: ## %bb.7: ## %cond.store5
; SSE4-NEXT: movhpd %xmm1, 24(%rdi)
; SSE4-NEXT: LBB12_8: ## %else6
; SSE4-NEXT: retq
;
; AVX1-LABEL: masked_store_bool_mask_demand_trunc_sext:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: masked_store_bool_mask_demand_trunc_sext:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: masked_store_bool_mask_demand_trunc_sext:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovupd %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: masked_store_bool_mask_demand_trunc_sext:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VLBW-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: vmovupd %ymm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
  %sext = sext <4 x i32> %masksrc to <4 x i64>
  %boolmask = trunc <4 x i64> %sext to <4 x i1>
  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %x, <4 x double>* %p, i32 4, <4 x i1> %boolmask)
  ret void
}

; This needs to be widened to v4i32.
; This used to assert in type legalization. PR38436
; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask.
define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
; SSE2-LABEL: widen_masked_store:
; SSE2: ## %bb.0:
; SSE2-NEXT: testb $1, %sil
; SSE2-NEXT: jne LBB13_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $1, %dl
; SSE2-NEXT: jne LBB13_3
; SSE2-NEXT: LBB13_4: ## %else2
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: jne LBB13_5
; SSE2-NEXT: LBB13_6: ## %else4
; SSE2-NEXT: retq
; SSE2-NEXT: LBB13_1: ## %cond.store
; SSE2-NEXT: movd %xmm0, (%rdi)
; SSE2-NEXT: testb $1, %dl
; SSE2-NEXT: je LBB13_4
; SSE2-NEXT: LBB13_3: ## %cond.store1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: movd %xmm1, 4(%rdi)
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB13_6
; SSE2-NEXT: LBB13_5: ## %cond.store3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: widen_masked_store:
; SSE4: ## %bb.0:
; SSE4-NEXT: testb $1, %sil
; SSE4-NEXT: jne LBB13_1
; SSE4-NEXT: ## %bb.2: ## %else
; SSE4-NEXT: testb $1, %dl
; SSE4-NEXT: jne LBB13_3
; SSE4-NEXT: LBB13_4: ## %else2
; SSE4-NEXT: testb $1, %cl
; SSE4-NEXT: jne LBB13_5
; SSE4-NEXT: LBB13_6: ## %else4
; SSE4-NEXT: retq
; SSE4-NEXT: LBB13_1: ## %cond.store
; SSE4-NEXT: movss %xmm0, (%rdi)
; SSE4-NEXT: testb $1, %dl
; SSE4-NEXT: je LBB13_4
; SSE4-NEXT: LBB13_3: ## %cond.store1
; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $1, %cl
; SSE4-NEXT: je LBB13_6
; SSE4-NEXT: LBB13_5: ## %cond.store3
; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: widen_masked_store:
; AVX1: ## %bb.0:
; AVX1-NEXT: vmovd %edx, %xmm1
; AVX1-NEXT: vmovd %esi, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: widen_masked_store:
; AVX2: ## %bb.0:
; AVX2-NEXT: vmovd %edx, %xmm1
; AVX2-NEXT: vmovd %esi, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: widen_masked_store:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: widen_masked_store:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VLBW-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm1 {%k1} {z}
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512VLBW-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
  call void @llvm.masked.store.v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v3i32(<3 x i32>, <3 x i32>*, i32, <3 x i1>)

declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)