; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s

; Run on devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted
9
10
; Two adjacent i8 constant stores to global memory; only the lower-address
; store carries an explicit alignment (2). The checks expect two separate
; byte stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  ; 456 does not fit in i8; -56 has the same low 8 bits (0xc8) and is an
  ; in-range literal.
  store i8 -56, i8 addrspace(1)* %out, align 2
  ret void
}
22
; Same pair of adjacent i8 constant stores, but with no explicit alignment
; on either store. The checks expect two separate byte stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  ; 456 does not fit in i8; -56 has the same low 8 bits (0xc8) and is an
  ; in-range literal.
  store i8 -56, i8 addrspace(1)* %out
  ret void
}
34
; The lower i16 store is marked align 4; the checks expect the two constant
; stores to be emitted as one dword store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %dst.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  ; Upper half is written first, then the align-4 lower half.
  store i16 123, i16 addrspace(1)* %dst.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}
44
; Variant of the align-4 i16 pair where both stored constants are zero; the
; checks still expect one dword store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %dst.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %dst.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}
54
; Adjacent i16 constant stores with no explicit alignment; the checks expect
; two separate short stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %dst.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %dst.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}
66
; Two adjacent i32 constant stores. The checks expect 456 (0x1c8) in the low
; register and 123 (0x7b) in the high register of a single dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  ; The higher-address element is stored first.
  store i32 123, i32 addrspace(1)* %dst.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}
80
; Mixed-type pair: a float constant goes to element 1 through a bitcast
; pointer and an i32 constant goes to element 0. The checks expect a single
; dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst.1.f32 = bitcast i32 addrspace(1)* %dst.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %dst.1.f32
  store i32 456, i32 addrspace(1)* %out
  ret void
}
90
; Mixed-type pair with the roles swapped: an i32 constant goes to element 1
; through a bitcast pointer and a float constant goes to element 0. The
; checks expect a single dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %dst.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %dst.1.i32 = bitcast float addrspace(1)* %dst.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %dst.1.i32
  store float 4.0, float addrspace(1)* %out
  ret void
}
100
; Four adjacent i32 constant stores, with element 0 written last. The checks
; expect a single dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %dst.1
  store i32 456, i32 addrspace(1)* %dst.2
  store i32 333, i32 addrspace(1)* %dst.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}
114
; Four adjacent float constant stores issued in ascending address order.
; The dwordx4 check is disabled (XGCN); the active checks expect two dword
; stores followed by a dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; XGCN: buffer_store_dwordx4
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dwordx2 v
define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %dst.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %dst.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %dst.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %dst.1
  store float 2.0, float addrspace(1)* %dst.2
  store float 4.0, float addrspace(1)* %dst.3
  ret void
}
131
; The store to the first element is out of order (it is issued last).
; Because of the order of combines, merging the consecutive stores fails:
; only some of the stores have been replaced with integer constant stores,
; and they then won't merge because the types are different.

; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; XGCN: buffer_store_dwordx4
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %dst.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %dst.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %dst.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %dst.1
  store float 2.0, float addrspace(1)* %dst.2
  store float 4.0, float addrspace(1)* %dst.3
  store float 8.0, float addrspace(1)* %out
  ret void
}
154
; Three adjacent i32 constant stores. The checks expect one dwordx2 store
; plus one single-dword store, and no further dword stores before the end
; of the program. The trailing " v" on the single-dword pattern keeps it
; from also matching the dwordx2 instruction, so each unordered check has
; to be satisfied by a different instruction.
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}
169
; Two adjacent i64 constant stores. The dwordx4 check is disabled (XGCN);
; the active checks expect two dwordx2 stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; XGCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %dst.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %dst.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}
181
; Four adjacent i64 constant stores, with element 0 written last. The
; dwordx4 checks are disabled (XGCN); the active checks expect four dwordx2
; stores.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; XGCN: buffer_store_dwordx4
; XGCN: buffer_store_dwordx4

; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %dst.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %dst.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %dst.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %dst.1
  store i64 456, i64 addrspace(1)* %dst.2
  store i64 333, i64 addrspace(1)* %dst.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}
201
; Copies two adjacent i32 values from %in to %out in matching order. The
; checks expect one dwordx2 load whose result registers feed one dwordx2
; store.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %src.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %val.0 = load i32, i32 addrspace(1)* %in
  %val.1 = load i32, i32 addrspace(1)* %src.1

  store i32 %val.0, i32 addrspace(1)* %out
  store i32 %val.1, i32 addrspace(1)* %dst.1
  ret void
}
216
; Two-element copy where both the source and destination start at element 2.
; The checks expect the dwordx2 load and store to both use offset:8.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %src.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %src.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %dst.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %val.0 = load i32, i32 addrspace(1)* %src.0
  %val.1 = load i32, i32 addrspace(1)* %src.1

  store i32 %val.0, i32 addrspace(1)* %dst.0
  store i32 %val.1, i32 addrspace(1)* %dst.1
  ret void
}
233
; Two adjacent i32 loads whose values are stored back in swapped order. The
; checks expect individual dword loads and stores.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %src.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %val.0 = load i32, i32 addrspace(1)* %in
  %val.1 = load i32, i32 addrspace(1)* %src.1

  ; Element 1 of the input lands in element 0 of the output and vice versa.
  store i32 %val.1, i32 addrspace(1)* %out
  store i32 %val.0, i32 addrspace(1)* %dst.1
  ret void
}
250
; Copies four adjacent i32 values from %in to %out in matching order. The
; checks expect one dwordx4 load whose result registers feed one dwordx4
; store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %src.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %src.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %src.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %val.0 = load i32, i32 addrspace(1)* %in
  %val.1 = load i32, i32 addrspace(1)* %src.1
  %val.2 = load i32, i32 addrspace(1)* %src.2
  %val.3 = load i32, i32 addrspace(1)* %src.3

  store i32 %val.0, i32 addrspace(1)* %out
  store i32 %val.1, i32 addrspace(1)* %dst.1
  store i32 %val.2, i32 addrspace(1)* %dst.2
  store i32 %val.3, i32 addrspace(1)* %dst.3
  ret void
}
273
; Copies three adjacent i32 values. The checks expect a dwordx2 plus a dword
; load, a wait, and then a dword plus a dwordx2 store (each pair in either
; order).
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword v
; GCN: s_waitcnt
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %src.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %src.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %val.0 = load i32, i32 addrspace(1)* %in
  %val.1 = load i32, i32 addrspace(1)* %src.1
  %val.2 = load i32, i32 addrspace(1)* %src.2

  store i32 %val.0, i32 addrspace(1)* %out
  store i32 %val.1, i32 addrspace(1)* %dst.1
  store i32 %val.2, i32 addrspace(1)* %dst.2
  ret void
}
296
; Float version of the four-element copy. The checks expect one dwordx4 load
; whose result registers feed one dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %dst.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %dst.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %dst.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %src.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %src.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %src.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %val.0 = load float, float addrspace(1)* %in
  %val.1 = load float, float addrspace(1)* %src.1
  %val.2 = load float, float addrspace(1)* %src.2
  %val.3 = load float, float addrspace(1)* %src.3

  store float %val.0, float addrspace(1)* %out
  store float %val.1, float addrspace(1)* %dst.1
  store float %val.2, float addrspace(1)* %dst.2
  store float %val.3, float addrspace(1)* %dst.3
  ret void
}
319
; Four-element copy reading from element 11 of %in and writing from element
; 7 of %out. The checks expect the dwordx4 load at offset:44 and the dwordx4
; store at offset:28.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %src.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %src.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %src.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %src.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %dst.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %dst.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %dst.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %val.0 = load i32, i32 addrspace(1)* %src.0
  %val.1 = load i32, i32 addrspace(1)* %src.1
  %val.2 = load i32, i32 addrspace(1)* %src.2
  %val.3 = load i32, i32 addrspace(1)* %src.3

  store i32 %val.0, i32 addrspace(1)* %dst.0
  store i32 %val.1, i32 addrspace(1)* %dst.1
  store i32 %val.2, i32 addrspace(1)* %dst.2
  store i32 %val.3, i32 addrspace(1)* %dst.3
  ret void
}
344
; Four-element copy where the stores are issued from the highest address
; down to the lowest, with a barrier between the loads and the stores. The
; checks expect a dwordx4 load, the barrier, then a dwordx4 store of the
; same registers.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %src.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %src.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %src.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %val.0 = load i32, i32 addrspace(1)* %in
  %val.1 = load i32, i32 addrspace(1)* %src.1
  %val.2 = load i32, i32 addrspace(1)* %src.2
  %val.3 = load i32, i32 addrspace(1)* %src.3

  ; The barrier must not prevent the loads and stores from being merged.
  tail call void @llvm.AMDGPU.barrier.local() #1

  store i32 %val.3, i32 addrspace(1)* %dst.3
  store i32 %val.2, i32 addrspace(1)* %dst.2
  store i32 %val.1, i32 addrspace(1)* %dst.1
  store i32 %val.0, i32 addrspace(1)* %out

  ret void
}
372
; TODO: Merging here would require re-packing the loaded registers. Maybe
; an IR pass should catch this?

; Four adjacent loads stored back in reversed element order, with a barrier
; in between. The checks expect four dword loads, the barrier, then four
; dword stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: s_barrier
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %src.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %src.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %src.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %val.0 = load i32, i32 addrspace(1)* %in
  %val.1 = load i32, i32 addrspace(1)* %src.1
  %val.2 = load i32, i32 addrspace(1)* %src.2
  %val.3 = load i32, i32 addrspace(1)* %src.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.AMDGPU.barrier.local() #1

  ; Input element 3 goes to output element 0, and so on down to input
  ; element 0 going to output element 3.
  store i32 %val.3, i32 addrspace(1)* %out
  store i32 %val.2, i32 addrspace(1)* %dst.1
  store i32 %val.1, i32 addrspace(1)* %dst.2
  store i32 %val.0, i32 addrspace(1)* %dst.3

  ret void
}
409
; Copies four adjacent bytes; the first load and the first store are marked
; align 4. The checks expect a single dword load feeding a single dword
; store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %dst.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %dst.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %dst.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %src.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %src.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %src.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %val.0 = load i8, i8 addrspace(1)* %in, align 4
  %val.1 = load i8, i8 addrspace(1)* %src.1
  %val.2 = load i8, i8 addrspace(1)* %src.2
  %val.3 = load i8, i8 addrspace(1)* %src.3

  store i8 %val.0, i8 addrspace(1)* %out, align 4
  store i8 %val.1, i8 addrspace(1)* %dst.1
  store i8 %val.2, i8 addrspace(1)* %dst.2
  store i8 %val.3, i8 addrspace(1)* %dst.3
  ret void
}
433
; Same four-byte copy with no explicit alignment on any access. The checks
; expect four byte loads followed by four byte stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %dst.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %dst.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %dst.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %src.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %src.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %src.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %val.0 = load i8, i8 addrspace(1)* %in
  %val.1 = load i8, i8 addrspace(1)* %src.1
  %val.2 = load i8, i8 addrspace(1)* %src.2
  %val.3 = load i8, i8 addrspace(1)* %src.3

  store i8 %val.0, i8 addrspace(1)* %out
  store i8 %val.1, i8 addrspace(1)* %dst.1
  store i8 %val.2, i8 addrspace(1)* %dst.2
  store i8 %val.3, i8 addrspace(1)* %dst.3
  ret void
}
463
; This works once AA is enabled on the subtarget; until then the dwordx4
; store check stays disabled (XGCN) and four dword stores are expected after
; the single dwordx4 load.
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XGCN: buffer_store_dwordx4 [[LOAD]]
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %packed = load <4 x i32>, <4 x i32> addrspace(1)* %in

  ; Split the vector into scalars and store each to its matching slot.
  %elt.0 = extractelement <4 x i32> %packed, i32 0
  %elt.1 = extractelement <4 x i32> %packed, i32 1
  %elt.2 = extractelement <4 x i32> %packed, i32 2
  %elt.3 = extractelement <4 x i32> %packed, i32 3

  store i32 %elt.0, i32 addrspace(1)* %out
  store i32 %elt.1, i32 addrspace(1)* %dst.1
  store i32 %elt.2, i32 addrspace(1)* %dst.2
  store i32 %elt.3, i32 addrspace(1)* %dst.3
  ret void
}
489
; Two adjacent i8 constant stores to addrspace(3) memory; only the
; lower-address store carries an explicit alignment (2). The checks expect
; two separate 8-bit ds writes.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  ; 456 does not fit in i8; -56 has the same low 8 bits (0xc8) and is an
  ; in-range literal.
  store i8 -56, i8 addrspace(3)* %out, align 2
  ret void
}
501
; Two adjacent i32 constant stores to addrspace(3) memory. The checks expect
; 456 (0x1c8) and 123 (0x7b) to be written by one ds_write2_b32 with
; offset1:1.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %dst.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}
515
; Four adjacent i32 constant stores to addrspace(3) memory, with element 0
; written last. The checks expect four separate 32-bit ds writes.
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: ds_write_b32
define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %dst.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %dst.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %dst.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %dst.1
  store i32 456, i32 addrspace(3)* %dst.2
  store i32 333, i32 addrspace(3)* %dst.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}
532
; Barrier intrinsic called by the tests that place a barrier between their
; loads and stores.
declare void @llvm.AMDGPU.barrier.local() #1

; #0 is attached to every test function; #1 to the barrier intrinsic and its
; call sites.
attributes #0 = { nounwind }
attributes #1 = { noduplicate nounwind }