; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

; Codegen tests for reductions over vectors of 16-bit elements. Each function
; is compiled for gfx900 and for fiji, and the two outputs are checked with
; separate prefixes.

; fadd reduction of <4 x half>: lanes 2-3 are added onto lanes 0-1, then
; lane 1 is added onto lane 0, and lane 0 is returned.
; GCN-LABEL: {{^}}reduction_fadd_v4f16:
; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_f16_sdwa
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
define half @reduction_fadd_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fadd <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ret half %res
}

; Same shuffle tree as the fadd case, but using fsub without fast-math flags.
; The gfx900 checks expect the packed subtract to be emitted as a packed add
; with the second operand negated (neg_lo/neg_hi).
; GCN-LABEL: {{^}}reduction_fsub_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GFX9-NEXT: v_sub_f16_sdwa v0, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64

; VI: v_sub_f16_sdwa
; VI-NEXT: v_sub_f16_e32
; VI-NEXT: v_sub_f16_e32
; VI-NEXT: s_setpc_b64
define half @reduction_fsub_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fsub <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fsub <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ret half %res
}

; Make sure nsz is preserved when the operations are split.
; The reduction result is negated with "fsub -0.0, %res"; the checks expect no
; separate negate instruction, only a final op with different operands than in
; reduction_fsub_v4f16.
; GCN-LABEL: {{^}}reduction_fsub_v4f16_preserve_fmf:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64

; VI: s_waitcnt
; VI-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_f16_e32 v0, v1, v0
; VI-NEXT: v_add_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64
define half @reduction_fsub_v4f16_preserve_fmf(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fsub nsz <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fsub nsz <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  %neg.res = fsub half -0.0, %res
  ret half %neg.res
}

; fmul reduction of <4 x half> using the same two-step shuffle tree.
; GCN-LABEL: {{^}}reduction_fmul_half4:
; GFX9: v_pk_mul_f16 [[MUL:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_mul_f16_sdwa v{{[0-9]+}}, [[MUL]], [[MUL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_mul_f16_sdwa
; VI-NEXT: v_mul_f16_e32
; VI-NEXT: v_mul_f16_e32
define half @reduction_fmul_half4(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fmul <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fmul <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ret half %res
}

; Integer add reduction of <4 x i16> using the same two-step shuffle tree.
; GCN-LABEL: {{^}}reduction_v4i16:
; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_u16_sdwa
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32
define i16 @reduction_v4i16(<4 x i16> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i16> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %bin.rdx, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <4 x i16> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x i16> %bin.rdx2, i32 0
  ret i16 %res
}

; fadd reduction of <8 x half>: three shuffle+add steps (lanes 4-7 onto 0-3,
; lanes 2-3 onto 0-1, lane 1 onto lane 0), then lane 0 is returned.
; GCN-LABEL: {{^}}reduction_half8:
; GFX9: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_f16_sdwa
; VI-NEXT: v_add_f16_sdwa
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32

define half @reduction_half8(<8 x half> %vec8) {
entry:
  %rdx.shuf = shufflevector <8 x half> %vec8, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x half> %vec8, %rdx.shuf
  %rdx.shuf1 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fadd <8 x half> %bin.rdx, %rdx.shuf1
  %rdx.shuf3 = shufflevector <8 x half> %bin.rdx2, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <8 x half> %bin.rdx2, %rdx.shuf3
  %res = extractelement <8 x half> %bin.rdx4, i32 0
  ret half %res
}

; Integer add reduction of <8 x i16> using a three-step shuffle tree.
; GCN-LABEL: {{^}}reduction_v8i16:
; GFX9: v_pk_add_u16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_u16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_u16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_u16_sdwa
; VI-NEXT: v_add_u16_sdwa
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32

define i16 @reduction_v8i16(<8 x i16> %vec8) {
entry:
  %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %vec8, %rdx.shuf
  %rdx.shuf1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <8 x i16> %bin.rdx, %rdx.shuf1
  %rdx.shuf3 = shufflevector <8 x i16> %bin.rdx2, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i16> %bin.rdx2, %rdx.shuf3
  %res = extractelement <8 x i16> %bin.rdx4, i32 0
  ret i16 %res
}

; fadd reduction of <16 x half> using a four-step shuffle tree (8, 4, 2, then
; 1 lane folded per step).
; GCN-LABEL: {{^}}reduction_half16:
; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_f16_sdwa
; VI-NEXT: v_add_f16_sdwa
; VI-NEXT: v_add_f16_sdwa
; VI-NEXT: v_add_f16_sdwa
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32

define half @reduction_half16(<16 x half> %vec16) {
entry:
  %rdx.shuf = shufflevector <16 x half> %vec16, <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <16 x half> %vec16, %rdx.shuf
  %rdx.shuf1 = shufflevector <16 x half> %bin.rdx, <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fadd <16 x half> %bin.rdx, %rdx.shuf1
  %rdx.shuf3 = shufflevector <16 x half> %bin.rdx2, <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <16 x half> %bin.rdx2, %rdx.shuf3
  %rdx.shuf5 = shufflevector <16 x half> %bin.rdx4, <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx6 = fadd <16 x half> %bin.rdx4, %rdx.shuf5
  %res = extractelement <16 x half> %bin.rdx6, i32 0
  ret half %res
}

; Unsigned min reduction of <4 x i16>, written as icmp ult + select at each
; step of the shuffle tree.
; GCN-LABEL: {{^}}reduction_min_v4i16:
; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_min_u16_sdwa
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
define i16 @reduction_min_v4i16(<4 x i16> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp ult <4 x i16> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp ult <4 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
  ret i16 %res
}

; Unsigned min reduction of <8 x i16>: three shuffle steps, each followed by
; icmp ult + select.
; GCN-LABEL: {{^}}reduction_umin_v8i16:
; GFX9: v_pk_min_u16 [[MIN1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_u16 [[MIN2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}}
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_min_u16_sdwa
; VI-NEXT: v_min_u16_sdwa
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
define i16 @reduction_umin_v8i16(<8 x i16> %vec8) {
entry:
  %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp ult <8 x i16> %vec8, %rdx.shuf
  %rdx.minmax.select = select <8 x i1> %rdx.minmax.cmp, <8 x i16> %vec8, <8 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <8 x i1> %rdx.minmax.cmp2, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf1
  %rdx.shuf4 = shufflevector <8 x i16> %rdx.minmax.select3, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp5 = icmp ult <8 x i16> %rdx.minmax.select3, %rdx.shuf4
  %rdx.minmax.select6 = select <8 x i1> %rdx.minmax.cmp5, <8 x i16> %rdx.minmax.select3, <8 x i16> %rdx.shuf4
  %res = extractelement <8 x i16> %rdx.minmax.select6, i32 0
  ret i16 %res
}

; Make sure that, without SLP vectorization, more instructions are emitted.
; The same unsigned min over 8 x i16 is written here as a scalar chain: every
; element is extracted and folded into a running minimum with icmp ult +
; select. Only gfx900 output is checked.
; GCN-LABEL: {{^}}reduction_umin_v8i16_woslp:
; GFX9: v_lshrrev_b32_e32
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_u16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_u16
; GFX9-NEXT: v_min3_u16
define i16 @reduction_umin_v8i16_woslp(<8 x i16> %vec8) {
entry:
  %elt0 = extractelement <8 x i16> %vec8, i64 0
  %elt1 = extractelement <8 x i16> %vec8, i64 1
  %elt2 = extractelement <8 x i16> %vec8, i64 2
  %elt3 = extractelement <8 x i16> %vec8, i64 3
  %elt4 = extractelement <8 x i16> %vec8, i64 4
  %elt5 = extractelement <8 x i16> %vec8, i64 5
  %elt6 = extractelement <8 x i16> %vec8, i64 6
  %elt7 = extractelement <8 x i16> %vec8, i64 7

  %cmp0 = icmp ult i16 %elt1, %elt0
  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
  %cmp1 = icmp ult i16 %elt2, %min1
  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
  %cmp2 = icmp ult i16 %elt3, %min2
  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2

  %cmp3 = icmp ult i16 %elt4, %min3
  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
  %cmp4 = icmp ult i16 %elt5, %min4
  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4

  %cmp5 = icmp ult i16 %elt6, %min5
  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
  %cmp6 = icmp ult i16 %elt7, %min6
  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6

  ret i16 %min7
}

; Signed min reduction of <16 x i16>: four shuffle steps, each followed by
; icmp slt + select.
; GCN-LABEL: {{^}}reduction_smin_v16i16:
; GFX9: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_min_i16_sdwa
; VI-NEXT: v_min_i16_sdwa
; VI-NEXT: v_min_i16_sdwa
; VI-NEXT: v_min_i16_sdwa
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
entry:
  %rdx.shuf = shufflevector <16 x i16> %vec16, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp slt <16 x i16> %vec16, %rdx.shuf
  %rdx.minmax.select = select <16 x i1> %rdx.minmax.cmp, <16 x i16> %vec16, <16 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <16 x i1> %rdx.minmax.cmp2, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf1
  %rdx.shuf4 = shufflevector <16 x i16> %rdx.minmax.select3, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp5 = icmp slt <16 x i16> %rdx.minmax.select3, %rdx.shuf4
  %rdx.minmax.select6 = select <16 x i1> %rdx.minmax.cmp5, <16 x i16> %rdx.minmax.select3, <16 x i16> %rdx.shuf4
  %rdx.shuf7 = shufflevector <16 x i16> %rdx.minmax.select6, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp8 = icmp slt <16 x i16> %rdx.minmax.select6, %rdx.shuf7
  %rdx.minmax.select9 = select <16 x i1> %rdx.minmax.cmp8, <16 x i16> %rdx.minmax.select6, <16 x i16> %rdx.shuf7
  %res = extractelement <16 x i16> %rdx.minmax.select9, i32 0
  ret i16 %res
}

; Make sure that, without SLP vectorization, more instructions are emitted.
; The same signed min over 16 x i16 is written here as a scalar chain: every
; element is extracted and folded into a running minimum with icmp slt +
; select. Only gfx900 output is checked.
; GCN-LABEL: {{^}}reduction_smin_v16i16_woslp:
; GFX9: v_lshrrev_b32_e32
; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_min3_i16
define i16 @reduction_smin_v16i16_woslp(<16 x i16> %vec16) {
entry:
  %elt0 = extractelement <16 x i16> %vec16, i64 0
  %elt1 = extractelement <16 x i16> %vec16, i64 1
  %elt2 = extractelement <16 x i16> %vec16, i64 2
  %elt3 = extractelement <16 x i16> %vec16, i64 3
  %elt4 = extractelement <16 x i16> %vec16, i64 4
  %elt5 = extractelement <16 x i16> %vec16, i64 5
  %elt6 = extractelement <16 x i16> %vec16, i64 6
  %elt7 = extractelement <16 x i16> %vec16, i64 7

  %elt8 = extractelement <16 x i16> %vec16, i64 8
  %elt9 = extractelement <16 x i16> %vec16, i64 9
  %elt10 = extractelement <16 x i16> %vec16, i64 10
  %elt11 = extractelement <16 x i16> %vec16, i64 11
  %elt12 = extractelement <16 x i16> %vec16, i64 12
  %elt13 = extractelement <16 x i16> %vec16, i64 13
  %elt14 = extractelement <16 x i16> %vec16, i64 14
  %elt15 = extractelement <16 x i16> %vec16, i64 15

  %cmp0 = icmp slt i16 %elt1, %elt0
  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
  %cmp1 = icmp slt i16 %elt2, %min1
  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
  %cmp2 = icmp slt i16 %elt3, %min2
  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2

  %cmp3 = icmp slt i16 %elt4, %min3
  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
  %cmp4 = icmp slt i16 %elt5, %min4
  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4

  %cmp5 = icmp slt i16 %elt6, %min5
  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
  %cmp6 = icmp slt i16 %elt7, %min6
  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6

  %cmp7 = icmp slt i16 %elt8, %min7
  %min8 = select i1 %cmp7, i16 %elt8, i16 %min7
  %cmp8 = icmp slt i16 %elt9, %min8
  %min9 = select i1 %cmp8, i16 %elt9, i16 %min8

  %cmp9 = icmp slt i16 %elt10, %min9
  %min10 = select i1 %cmp9, i16 %elt10, i16 %min9
  %cmp10 = icmp slt i16 %elt11, %min10
  %min11 = select i1 %cmp10, i16 %elt11, i16 %min10

  %cmp11 = icmp slt i16 %elt12, %min11
  %min12 = select i1 %cmp11, i16 %elt12, i16 %min11
  %cmp12 = icmp slt i16 %elt13, %min12
  %min13 = select i1 %cmp12, i16 %elt13, i16 %min12

  %cmp13 = icmp slt i16 %elt14, %min13
  %min14 = select i1 %cmp13, i16 %elt14, i16 %min13
  %cmp14 = icmp slt i16 %elt15, %min14
  %min15 = select i1 %cmp14, i16 %elt15, i16 %min14

  ret i16 %min15
}

; Unsigned max reduction of <4 x i16>, written as icmp ugt + select at each
; step of the shuffle tree.
; GCN-LABEL: {{^}}reduction_umax_v4i16:
; GFX9: v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_max_u16_sdwa
; VI-NEXT: v_max_u16_e32
; VI-NEXT: v_max_u16_e32
define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp ugt <4 x i16> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp ugt <4 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
  ret i16 %res
}

; Signed max reduction of <4 x i16>, written as icmp sgt + select at each step
; of the shuffle tree.
; NOTE(review): the define references attribute group #0, but no
; "attributes #0" definition is visible in this part of the file - confirm
; whether the reference is intentional.
; GCN-LABEL: {{^}}reduction_smax_v4i16:
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_max_i16_sdwa
; VI-NEXT: v_max_i16_e32
; VI-NEXT: v_max_i16_e32
define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 {
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp sgt <4 x i16> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp sgt <4 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
  ret i16 %res
}

; Floating-point max reduction of <4 x half> using the llvm.maxnum.v4f16
; intrinsic at each step of the shuffle tree. The checks expect both inputs to
; be canonicalized first (max of a register with itself).
; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}

; FIXME: Extra canonicalize leftover
; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]

; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax3 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
  %res = extractelement <4 x half> %rdx.minmax3, i32 0
  ret half %res
}

; Floating-point min reduction of <4 x half> using the llvm.minnum.v4f16
; intrinsic at each step of the shuffle tree. The checks expect both inputs to
; be canonicalized first (max of a register with itself).
; GCN-LABEL: {{^}}reduction_minnum_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}

; FIXME: Extra canonicalize leftover
; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]


; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_min_f16_e32 [[MIN0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_min_f16_e32 [[MIN1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_min_f16_e32 v0, [[MIN1]], [[MIN0]]
define half @reduction_minnum_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax = call <4 x half> @llvm.minnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax3 = call <4 x half> @llvm.minnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
  %res = extractelement <4 x half> %rdx.minmax3, i32 0
  ret half %res
}

; FIXME: Need to preserve fast math flags when fmaxnum matched
; directly from the IR to avoid unnecessary quieting.

; Max reduction of <4 x half> written as "fcmp nnan nsz ogt" + select instead
; of the maxnum intrinsic. The XGFX9/XVI lines are disabled checks for the
; output wanted once the FIXME is resolved; the active checks below them match
; the output with the extra canonicalizing instructions.
; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
; XGFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; XVI: s_waitcnt
; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; XVI-NEXT: v_max_f16_e32 v0, v0, v1
; XVI-NEXT: v_max_f16_e32 v0, v0, v2
; XVI-NEXT: s_setpc_b64

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}

; FIXME: Extra canonicalize leftover
; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]

; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = fcmp nnan nsz ogt <4 x half> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = fcmp nnan nsz ogt <4 x half> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
  %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
  ret half %res
}

; FIXME: Need to preserve fast math flags when fminnum matched
; directly from the IR to avoid unnecessary quieting.

; Min reduction of <4 x half> written as "fcmp nnan nsz olt" + select instead
; of the minnum intrinsic. The XGFX9/XVI lines are disabled checks for the
; output wanted once the FIXME is resolved; the active checks below them match
; the output with the extra canonicalizing instructions.
; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
; XGFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; XVI: s_waitcnt
; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; XVI-NEXT: v_min_f16_e32 v0, v0, v1
; XVI-NEXT: v_min_f16_e32 v0, v0, v2
; XVI-NEXT: s_setpc_b64

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}

; FIXME: Extra canonicalize leftover
; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]


; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_min_f16_e32 [[MIN0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_min_f16_e32 [[MIN1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_min_f16_e32 v0, [[MIN1]], [[MIN0]]
define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = fcmp nnan nsz olt <4 x half> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = fcmp nnan nsz olt <4 x half> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
  %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
  ret half %res
}

; Intrinsics called by reduction_minnum_v4f16 and reduction_maxnum_v4f16.
declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)