; Source blob: 87148ae9f69c5fa8cc4efedc53a6f124ed258137
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s

; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; Two adjacent constant i8 stores to addrspace(1); the store to the base
; pointer carries align 2. The checks expect two separate byte stores.
; The second constant was originally written as 456, which does not fit in
; i8; it is spelled as -56 here, the same 8-bit pattern (0xC8).
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 -56, i8 addrspace(1)* %out, align 2
  ret void
}

; Two adjacent constant i8 stores to addrspace(1) with no explicit alignment
; on either store. The checks expect two separate byte stores.
; The second constant was originally written as 456, which does not fit in
; i8; it is spelled as -56 here, the same 8-bit pattern (0xC8).
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 -56, i8 addrspace(1)* %out
  ret void
}

; Two adjacent constant i16 stores to addrspace(1); the store to the base
; pointer carries align 4. The check expects a single dword store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}

; Same shape as the i16 constant pair test, but both stored values are zero.
; The check expects a single dword store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %out.gep.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}

; Two adjacent constant i16 stores with no explicit alignment on either
; store. The checks expect two separate short stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}

; Two adjacent constant i32 stores, written higher address first. The checks
; expect one dwordx2 store whose register pair holds 0x1c8 (456) in the low
; half and 0x7b (123) in the high half.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed-type pair: a float constant is stored through a bitcast of element 1
; and an i32 constant is stored to element 0. The check expects one dwordx2
; store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %out.gep.1.bc
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed-type pair, mirrored: an i32 constant is stored through a bitcast of
; element 1 and a float constant is stored to element 0. The checks expect
; one dwordx2 store with 4.0 in the low register and 0x7b (123) in the high.
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0
; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}}
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %out.gep.1.bc
  store float 4.0, float addrspace(1)* %out
  ret void
}

; Four adjacent constant i32 stores, with element 0 written last. The checks
; expect one dwordx4 store whose first register holds 0x4d2 (1234) and whose
; last register holds 0x14d (333).
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Four adjacent constant float stores written in address order. The active
; checks expect two dword stores followed by one dwordx2 store. The XGCN line
; is inactive (XGCN is not an enabled check prefix) and presumably records
; the fully merged form.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; XGCN: buffer_store_dwordx4
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dwordx2 v
define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  ret void
}

; First store is out of order. Because of order of combines, the
; consecutive store fails because only some of the stores have been
; replaced with integer constant stores, and then won't merge because
; the types are different.

; Here the store to element 0 is issued last. The active checks expect four
; separate dword stores; the XGCN line is inactive.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; XGCN: buffer_store_dwordx4
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Three adjacent constant i32 stores. The checks expect one dwordx2 store and
; one dword store, in either order, and no further dword store before the
; end of the program.
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Two adjacent constant i64 stores. The active checks expect two dwordx2
; stores; the XGCN line (a single dwordx4 store) is inactive.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; XGCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}

; Four adjacent constant i64 stores. The active checks expect four dwordx2
; stores; the XGCN lines (two dwordx4 stores) are inactive.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; XGCN: buffer_store_dwordx4
; XGCN: buffer_store_dwordx4

; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out.gep.2
  store i64 333, i64 addrspace(1)* %out.gep.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}

; Copies two adjacent i32 elements from %in to the same positions in %out.
; The checks expect one dwordx2 load whose result register pair feeds one
; dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; Copies i32 elements 2 and 3 of %in to elements 2 and 3 of %out. The checks
; expect a merged dwordx2 load and store, each with an immediate offset of 8
; bytes (element index 2 * 4 bytes).
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %lo = load i32, i32 addrspace(1)* %in.gep.0
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out.gep.0
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; Loads two adjacent i32 elements and stores them swapped (element 1 to
; position 0 and element 0 to position 1). The checks expect two dword loads
; and two dword stores, i.e. no merging.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %hi, i32 addrspace(1)* %out
  store i32 %lo, i32 addrspace(1)* %out.gep.1
  ret void
}

; Copies four adjacent i32 elements from %in to the same positions in %out.
; The checks expect one dwordx4 load whose result registers feed one dwordx4
; store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Copies three adjacent i32 elements. The checks expect a dwordx2 load plus a
; dword load, a wait, then a dword store plus a dwordx2 store (each pair in
; either order).
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword v
; GCN: s_waitcnt
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  ret void
}

; Float variant of the four-element copy. The checks expect one dwordx4 load
; whose result registers feed one dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %x = load float, float addrspace(1)* %in
  %y = load float, float addrspace(1)* %in.gep.1
  %z = load float, float addrspace(1)* %in.gep.2
  %w = load float, float addrspace(1)* %in.gep.3

  store float %x, float addrspace(1)* %out
  store float %y, float addrspace(1)* %out.gep.1
  store float %z, float addrspace(1)* %out.gep.2
  store float %w, float addrspace(1)* %out.gep.3
  ret void
}

; Copies i32 elements 11..14 of %in to elements 7..10 of %out. The checks
; expect a dwordx4 load at byte offset 44 (11 * 4) and a dwordx4 store at
; byte offset 28 (7 * 4).
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %x = load i32, i32 addrspace(1)* %in.gep.0
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out.gep.0
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Four-element copy where the stores are issued in descending address order
; (each element still goes to its matching position) and a barrier call sits
; between the loads and the stores. The checks expect a dwordx4 load, the
; barrier, then a dwordx4 store of the loaded registers.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.AMDGPU.barrier.local() #1

  store i32 %w, i32 addrspace(1)* %out.gep.3
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %x, i32 addrspace(1)* %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; Four elements are loaded and stored back in reversed positions (element 3
; to position 0, and so on) with a barrier call in between. The checks expect
; four dword loads, the barrier, then four dword stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: s_barrier
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.AMDGPU.barrier.local() #1

  store i32 %w, i32 addrspace(1)* %out
  store i32 %z, i32 addrspace(1)* %out.gep.1
  store i32 %y, i32 addrspace(1)* %out.gep.2
  store i32 %x, i32 addrspace(1)* %out.gep.3

  ret void
}

; Copies four adjacent i8 elements; the first load and the first store are
; marked align 4. The checks expect one dword load feeding one dword store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in, align 4
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out, align 4
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; Copies four adjacent i8 elements with no explicit alignment on any access.
; The checks expect four byte loads and four byte stores, i.e. no merging.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; This works once AA is enabled on the subtarget
; A <4 x i32> vector is loaded once and its four elements are stored to
; adjacent i32 slots. The active checks expect a dwordx4 load followed by
; four dword stores; the XGCN line (a single dwordx4 store) is inactive.
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XGCN: buffer_store_dwordx4 [[LOAD]]
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Two adjacent constant i8 stores to addrspace(3); the store to the base
; pointer carries align 2. The checks expect two separate ds_write_b8.
; The second constant was originally written as 456, which does not fit in
; i8; it is spelled as -56 here, the same 8-bit pattern (0xC8).
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  store i8 -56, i8 addrspace(3)* %out, align 2
  ret void
}

; Two adjacent constant i32 stores to addrspace(3). The checks expect a
; single ds_write2_b32 with 0x1c8 (456) as the first data operand and 0x7b
; (123) as the second, at offset1:1.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}

; Four adjacent constant i32 stores to addrspace(3). The checks expect four
; separate ds_write_b32 instructions.
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: ds_write_b32
define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}

; Five adjacent constant i32 stores in address order. The checks expect a
; dwordx4 store spanning the registers holding 9 through -12 (elements 0..3),
; then a separate dword store of 11 (element 4).
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}

; Six adjacent constant i32 stores in address order. The checks expect one
; dwordx4 store followed by one dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}

; Seven adjacent constant i32 stores in address order. The checks expect a
; dwordx4 store, then a dwordx2 store, then a single dword store.
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dword v
define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}

; Eight adjacent constant i32 stores in address order. The active checks
; expect eight separate dword stores; the XGCN lines (two dwordx4 stores)
; are inactive.
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; XGCN: buffer_store_dwordx4
; XGCN: buffer_store_dwordx4

; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}

; Barrier intrinsic called between the loads and stores of the
; *_adjacent_loads_inverse_i32 and *_adjacent_loads_shuffle_i32 tests, where
; it is checked as s_barrier.
declare void @llvm.AMDGPU.barrier.local() #1

; #0 is attached to most test functions; #1 to the barrier declaration and
; its call sites.
attributes #0 = { nounwind }
attributes #1 = { noduplicate nounwind }