; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s

; Adds two <2 x i16> vectors that are volatile-loaded from %in0 and %in1 at an
; offset of the workitem id. The gfx901 run expects one v_pk_add_u16 with three
; VGPR operands; the tonga run expects two v_add_u16_e32.
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16:
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Adds two <2 x i16> vectors loaded directly through the addrspace(2) pointers
; %in0 and %in1 (no workitem-id indexing). The gfx901 run expects two
; s_load_dword, a v_mov_b32_e32, and a v_pk_add_u16 whose sources are the
; v_mov result and the first loaded SGPR; the tonga run expects two s_add_i32.
; GCN-LABEL: {{^}}s_test_add_v2i16:
; GFX9: s_load_dword [[VAL0:s[0-9]+]]
; GFX9: s_load_dword [[VAL1:s[0-9]+]]
; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]

; VI: s_add_i32
; VI: s_add_i32
define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
  %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Adds a <2 x i16> vector, loaded through the addrspace(2) pointer %in0, to
; itself. The gfx901 run expects one s_load_dword whose SGPR feeds both sources
; of v_pk_add_u16; the tonga run expects two s_add_i32.
; GCN-LABEL: {{^}}s_test_add_self_v2i16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL]], [[VAL]]

; VI: s_add_i32
; VI: s_add_i32
define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
  ; Same value on both sides of the add.
  %add = add <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Adds two <2 x i16> values that are passed directly as the function arguments
; %a and %b (nothing is loaded in the IR). The gfx901 run expects a v_pk_add_u16
; with one VGPR and one SGPR source; the tonga run expects two v_add_i32.
; FIXME: VI should not scalarize arg access.
; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}

; VI: v_add_i32
; VI: v_add_i32
define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Adds the constant vector <123, 456> to a volatile-loaded <2 x i16>.
; 123 is 0x7b and 456 is 0x1c8, so the packed 32-bit constant is 0x1c8007b
; (element 1 in the high half, element 0 in the low half). The gfx901 run
; expects that value in an SGPR used as the first source of v_pk_add_u16; the
; tonga run expects one v_add_u16_e32 per element, in either order.
; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}

; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 123, i16 456>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Adds the constant vector <-845, -991> to a volatile-loaded <2 x i16>.
; As 16-bit two's complement -845 is 0xfcb3 and -991 is 0xfc21, so the packed
; 32-bit constant is 0xfc21fcb3. The gfx901 run expects that value in an SGPR
; used as the first source of v_pk_add_u16; the tonga run expects one
; v_add_u16_e32 per element with the literal printed sign-extended to 32 bits
; (0xfffffcb3 and 0xfffffc21), in either order.
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant:
; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}

; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 -845, i16 -991>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Adds <-1, -1> to a volatile-loaded <2 x i16>. The gfx901 run expects -1 to
; appear directly as the last operand of v_pk_add_u16. The tonga run expects
; two flat_load_ushort, a v_add_u16_e32 of -1 on each loaded half, a
; v_lshlrev_b32_e32 by 16, and a final v_or_b32_e32.
; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}

; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD0]]
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; VI: v_or_b32_e32
define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 -1, i16 -1>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Adds <32, 0> to a volatile-loaded <2 x i16>; only element 0 is changed, and
; the packed 32-bit constant is simply 32. The gfx901 run expects 32 moved into
; an SGPR that is the first source of v_pk_add_u16. The tonga run expects
; exactly one v_add_u16_e32 (of 32) before the v_lshlrev_b32_e32 by 16 and the
; v_or_b32_e32, i.e. no add is emitted for the zero element.
; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}

; VI-NOT: v_add_u16
; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
; VI-NOT: v_add_u16
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; VI: v_or_b32_e32
define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 32, i16 0>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; The high element gives fp
; Adds <0, 16256> to a volatile-loaded <2 x i16>. 16256 is 0x3f80, so the
; packed 32-bit constant is 0x3f800000, the bit pattern of the 32-bit float
; 1.0; the gfx901 run expects the s_mov_b32 operand to be printed as 1.0. The
; tonga run expects exactly one v_add_u16_e32 (of 0x3f80) before the
; v_lshlrev_b32_e32 by 16 and the v_or_b32_e32.
; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}

; VI-NOT: v_add_u16
; VI: v_add_u16_e32 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}}
; VI-NOT: v_add_u16
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; VI: v_or_b32_e32
define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 0, i16 16256>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Adds two volatile-loaded <2 x i16> vectors and zero-extends the sum to
; <2 x i32> before storing it. The gfx901 run expects two flat_load_dword, one
; v_pk_add_u16, then a v_and_b32_e32 with 0xffff and a v_lshrrev_b32_e32 by 16
; to split the halves, stored as a register pair with buffer_store_dwordx2.
; The tonga run expects four flat_load_ushort and two v_add_u16_e32 with no
; "and"/"shl" between them and the buffer_store_dwordx2.
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i32:
; GFX9: flat_load_dword [[A:v[0-9]+]]
; GFX9: flat_load_dword [[B:v[0-9]+]]

; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}

; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: flat_load_ushort v[[B_LO:[0-9]+]]

; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
; VI-NOT: and
; VI-NOT: shl
; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
; VI-NOT: and
; VI-NOT: shl
; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; Adds two volatile-loaded <2 x i16> vectors and zero-extends the sum to
; <2 x i64> before storing it. The gfx901 run expects two flat_load_dword, a
; v_mov_b32_e32 of 0, one v_pk_add_u16, a v_and_b32_e32 with 0xffff and a
; v_lshrrev_b32_e32 by 16 to split the halves, and a buffer_store_dwordx4.
; The tonga run expects four flat_load_ushort, two v_mov_b32_e32 of 0, two
; v_add_u16_e32, and a buffer_store_dwordx4.
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64:
; GFX9: flat_load_dword [[A:v[0-9]+]]
; GFX9: flat_load_dword [[B:v[0-9]+]]

; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx4

; VI: flat_load_ushort v[[A_LO:[0-9]+]]
; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
; VI: flat_load_ushort v[[B_HI:[0-9]+]]

; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; VI: v_add_u16_e32
; VI: v_add_u16_e32

; VI: buffer_store_dwordx4
define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; Adds two volatile-loaded <2 x i16> vectors and sign-extends the sum to
; <2 x i32> before storing it. The gfx901 run expects two flat_load_dword, one
; v_pk_add_u16, then a v_bfe_i32 (offset 0, width 16) for element 0 and a
; v_ashrrev_i32_e32 by 16 for element 1, stored as a register pair with
; buffer_store_dwordx2. The tonga run expects two v_add_u16_e32, two v_bfe_i32
; (offset 0, width 16), and a buffer_store_dwordx2.
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i32:
; GFX9: flat_load_dword [[A:v[0-9]+]]
; GFX9: flat_load_dword [[B:v[0-9]+]]

; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}

; VI: v_add_u16_e32
; VI: v_add_u16_e32
; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; VI: buffer_store_dwordx2
define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; Adds two <2 x i16> vectors loaded at an offset of the workitem id and
; sign-extends the sum to <2 x i64> before storing it. Both runs expect two
; flat_load_dword, two v_bfe_i32 (offset 0, width 16) and two
; v_ashrrev_i32_e32 by 31. In between, the gfx901 run expects a v_pk_add_u16
; followed by a v_lshrrev_b32_e32 by 16, and the tonga run expects two
; v_add_u16_e32.
; NOTE(review): unlike the other v_test_* functions in this file, the two
; loads here are not volatile -- confirm that this is intentional.
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i64:
; GCN: flat_load_dword
; GCN: flat_load_dword

; GFX9: v_pk_add_u16
; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}

; VI: v_add_u16_e32
; VI: v_add_u16_e32

; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
define void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE: %gep.out is computed but never used; the store below goes through
  ; %out itself.
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; Workitem-id intrinsic called by every v_test_* function in this file to
; index its input pointers.
declare i32 @llvm.amdgcn.workitem.id.x() #0

; Attribute group #0 is attached to the intrinsic declaration; group #1 is
; attached to every test function definition.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }