; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=cannonlake | FileCheck %s

; These test cases demonstrate cases where vpermt2/vpermi2 could benefit from being commuted.
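; In each case below the result is currently produced in the register holding the
; tied operand (e.g. %zmm1) and then copied to the return register with a
; vmovdqa64/vmovaps/vmovapd; commuting the operands should allow the result to be
; written to %zmm0 directly, removing that extra move.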

declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2d (%rdi), %zmm0, %zmm1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
  %x2 = load <16 x i32>, <16 x i32>* %x2p
  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  ret <16 x i32> %res
}

declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
  ret <8 x double> %res
}

declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
  ret <16 x float> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  ret <8 x i64> %res
}

declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
  %x2 = load <16 x i32>, <16 x i32>* %x2p
  %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  ret <16 x i32> %res
}

declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
  %x2s = load double, double* %x2ptr
  %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
  %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
  %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  ret <8 x double> %res
}

declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  ret <16 x float> %res
}


declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  ret <8 x i64> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  ret <16 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
  ret <4 x i32> %res
}

define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128_broadcast(<4 x i32> %x0, <4 x i32> %x1, i32* %x2ptr, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128_broadcast:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vpermt2d (%rdi){1to4}, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: retq
  %x2s = load i32, i32* %x2ptr
  %x2ins = insertelement <4 x i32> undef, i32 %x2s, i32 0
  %x2 = shufflevector <4 x i32> %x2ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
  ret <4 x i32> %res
}

declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
  ret <8 x i32> %res
}

declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: retq
  %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
  ret <2 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
  %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
  ret <4 x double> %res
}

declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
  %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
  ret <4 x float> %res
}

declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
  ret <8 x float> %res
}

define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_load(<8 x float> %x0, <8 x i32> %x1, <8 x float>* %x2p) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256_load:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps (%rdi), %ymm0, %ymm1
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %x2 = load <8 x float>, <8 x float>* %x2p
  %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
  ret <8 x float> %res
}

define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_broadcast(<8 x float> %x0, <8 x i32> %x1, float* %x2ptr) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256_broadcast:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps (%rdi){1to8}, %ymm0, %ymm1
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %x2s = load float, float* %x2ptr
  %x2ins = insertelement <8 x float> undef, float %x2s, i32 0
  %x2 = shufflevector <8 x float> %x2ins, <8 x float> undef, <8 x i32> zeroinitializer
  %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
  ret <8 x float> %res
}

declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)

define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2b %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: retq
  %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
  ret <16 x i8> %res
}

declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)

define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: retq
  %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
  ret <32 x i8> %res
}

declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)

define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: retq
  %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
  ret <16 x i8> %res
}

define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128_load(<16 x i8> %x0, <16 x i8> %x1, <16 x i8>* %x2p) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128_load:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2b (%rdi), %xmm0, %xmm1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: retq
  %x2 = load <16 x i8>, <16 x i8>* %x2p
  %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
  ret <16 x i8> %res
}

declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)

define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: retq
  %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
  ret <32 x i8> %res
}

declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)

define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: retq
  %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
  ret <16 x i8> %res
}

define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128_load(<16 x i8> %x0, <16 x i8> %x1, <16 x i8>* %x2p, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128_load:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermt2b (%rdi), %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: retq
  %x2 = load <16 x i8>, <16 x i8>* %x2p
  %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
  ret <16 x i8> %res
}

declare <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)

define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: retq
  %res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
  ret <32 x i8> %res
}

define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256_load(<32 x i8> %x0, <32 x i8> %x1, <32 x i8>* %x2p, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256_load:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermt2b (%rdi), %ymm0, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: retq
  %x2 = load <32 x i8>, <32 x i8>* %x2p
  %res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
  ret <32 x i8> %res
}