; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512BW
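; The functions below exercise the vector averaging idiom
; ((zext(a) + zext(b) + 1) >> 1, truncated back to the element type) and
; check that it is lowered to the X86 PAVGB/PAVGW instructions on the SSE2,
; AVX2 and AVX-512BW targets selected by the RUN lines above.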

define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v4i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v4i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovd (%rdi), %xmm0
; AVX512BW-NEXT: vmovd (%rsi), %xmm1
; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v8i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v8i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vmovq (%rsi), %xmm1
; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v4i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v4i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vmovq (%rsi), %xmm1
; AVX512BW-NEXT: vpavgw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgw (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v16i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8_2:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v4i8_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v4i8_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovd (%rdi), %xmm0
; AVX512BW-NEXT: vmovd (%rsi), %xmm1
; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8_2:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v8i8_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v8i8_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vmovq (%rsi), %xmm1
; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8_2:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8_2:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i8_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %4, %4
  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}


define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16_2:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v4i16_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v4i16_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vmovq (%rsi), %xmm1
; AVX512BW-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16_2:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16_2:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v16i16_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

define void @avg_v4i8_const(<4 x i8>* %a) {
; SSE2-LABEL: avg_v4i8_const:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movd %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v4i8_const:
; AVX2: # BB#0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v4i8_const:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovd (%rdi), %xmm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i8>
  store <4 x i8> %5, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_const(<8 x i8>* %a) {
; SSE2-LABEL: avg_v8i8_const:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v8i8_const:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v8i8_const:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i8>
  store <8 x i8> %5, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_const(<16 x i8>* %a) {
; SSE2-LABEL: avg_v16i8_const:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8_const:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i8>
  store <16 x i8> %5, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_const(<32 x i8>* %a) {
; AVX2-LABEL: avg_v32i8_const:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i8_const:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = zext <32 x i8> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i8>
  store <32 x i8> %5, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX512BW-LABEL: avg_v64i8_const:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = zext <64 x i8> %1 to <64 x i32>
  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <64 x i32> %4 to <64 x i8>
  store <64 x i8> %5, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16_const(<4 x i16>* %a) {
; SSE2-LABEL: avg_v4i16_const:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v4i16_const:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v4i16_const:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i16>
  store <4 x i16> %5, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_const(<8 x i16>* %a) {
; SSE2-LABEL: avg_v8i16_const:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16_const:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i16>
  store <8 x i16> %5, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_const(<16 x i16>* %a) {
; AVX2-LABEL: avg_v16i16_const:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: avg_v16i16_const:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i16>
  store <16 x i16> %5, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX512BW-LABEL: avg_v32i16_const:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = zext <32 x i16> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i16>
  store <32 x i16> %5, <32 x i16>* undef, align 4
  ret void
}