; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL

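; These tests build the rounded-average idiom in IR,
;   avg(a, b) = trunc((zext(a) + zext(b) + 1) >> 1),
; under a masked select, and check that it lowers to VPAVGB/VPAVGW.
; With AVX512BW+VL the average is predicated directly ({%k1}, or {%k1} {z}
; for zero-masking). Plain AVX512F cannot mask byte/word elements, so it
; first materializes the bitmask as a vector: vpternlogd $255 with {%k1} {z}
; writes all-ones into the selected dword lanes, vpmovdb truncates them to
; 0xFF/0x00 bytes, and vpblendvb (or vpand for zero-masking) does the select.
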
define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v16i8_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT: retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src
  ret <16 x i8> %res
}

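; Zero-masking: AVX512BWVL uses the {%k1} {z} form of vpavgb; AVX512F gets
; the same effect by AND-ing the average with the widened mask bytes.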
define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v16i8_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}

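; A 32-bit mask: kmovd needs AVX512BW, so AVX512F spills the i32 mask to an
; aligned stack slot and reloads it as two 16-bit halves with kmovw, widening
; each half to 16 mask bytes and rejoining them with vinserti128.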
define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT: retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src
  ret <32 x i8> %res
}

define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}

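; 512-bit byte vectors require AVX512BW as well, so AVX512F receives
; <64 x i8> in two ymm pairs, averages them as four xmm halves, and consumes
; the i64 mask as four kmovw chunks (shrq $32 stores the high half first).
; AVX512BWVL collapses everything into one masked zmm vpavgb behind kmovq.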
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm8
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7
; AVX512F-NEXT: vpavgb %xmm7, %xmm6, %xmm6
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512F-NEXT: vpavgb %xmm7, %xmm8, %xmm7
; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovq %rdi, %k1
; AVX512BWVL-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT: retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src
  ret <64 x i8> %res
}

define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovq %rdi, %k1
; AVX512BWVL-NEXT: vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer
  ret <64 x i8> %res
}

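; Word-element tests: the same idiom on i16 lanes selects VPAVGW. For the
; i8 mask, AVX512F materializes all-ones lanes at qword granularity
; (vpternlogq with {%k1} {z}, then vpmovqw), and the merge is expanded as
; and/andn/or instead of a byte blend.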
define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v8i16_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT: retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src
  ret <8 x i16> %res
}

define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v8i16_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

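; Sixteen word lanes fit a ymm, so the AVX512F mask expansion runs at dword
; granularity (vpternlogd + vpmovdw); no vzeroupper is emitted because the
; result is returned in a ymm register.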
define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpandn %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v16i16_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT: retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src
  ret <16 x i16> %res
}

define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v16i16_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

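; <32 x i16> is split into two ymm halves on AVX512F. Each 16-bit mask chunk
; is widened to word lanes by zero-extending the 0xFF/0x00 mask bytes
; (vpmovzxbw) and turning bit 0 into a full-width sign mask with
; vpsllw $15 / vpsraw $15 before the blend.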
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm6, %xmm6
; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT: retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src
  ret <32 x i16> %res
}

define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}