blob: fec27e7168ce74595d4cd35597d4dd9214ccd1ce [file] [log] [blame]
Matt Arsenaulte5d95152015-10-13 00:49:00 +00001; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
2; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
3
4; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
5; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
Matt Arsenault65ad1602015-05-24 00:51:27 +00006
7; Run with devices with different unaligned load restrictions.
8
9; TODO: Vector element tests
10; TODO: Non-zero base offset for load and store combinations
11; TODO: Same base addrspacecasted
12
13
14; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
15; GCN: buffer_store_byte
16; GCN: buffer_store_byte
17; GCN: s_endpgm
; Stores two i8 constants to adjacent bytes: first %out + 1, then %out.
; Only the second store carries an explicit alignment (align 2).
; NOTE(review): 456 does not fit in 8 bits; presumably the IR parser wraps it
; to the low byte -- confirm this is intentional.
18define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
19 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
20
21 store i8 123, i8 addrspace(1)* %out.gep.1
22 store i8 456, i8 addrspace(1)* %out, align 2
23 ret void
24}
25
26; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
27; GCN: buffer_store_byte
28; GCN: buffer_store_byte
29; GCN: s_endpgm
; Same store pattern as merge_global_store_2_constants_i8 (%out + 1 first,
; then %out), but neither store has an explicit alignment.
; NOTE(review): 456 does not fit in 8 bits; presumably the IR parser wraps it
; to the low byte -- confirm this is intentional.
30define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
31 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
32
33 store i8 123, i8 addrspace(1)* %out.gep.1
34 store i8 456, i8 addrspace(1)* %out
35 ret void
36}
37
38; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
39; GCN: buffer_store_dword v
; Stores two i16 constants: element 1 first, then element 0 of %out.
; The store to %out is marked align 4; the other has no explicit alignment.
40define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
41 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
42
43 store i16 123, i16 addrspace(1)* %out.gep.1
44 store i16 456, i16 addrspace(1)* %out, align 4
45 ret void
46}
47
48; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
49; GCN: buffer_store_dword v
; Variant of merge_global_store_2_constants_i16 in which both stored i16
; constants are zero. The store to %out is again marked align 4.
50define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
51 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
52
53 store i16 0, i16 addrspace(1)* %out.gep.1
54 store i16 0, i16 addrspace(1)* %out, align 4
55 ret void
56}
57
58; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
59; GCN: buffer_store_short
60; GCN: buffer_store_short
61; GCN: s_endpgm
; Stores two i16 constants (element 1, then element 0) with no explicit
; alignment on either store.
62define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
63 %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
64
65 store i16 123, i16 addrspace(1)* %out.gep.1
66 store i16 456, i16 addrspace(1)* %out
67 ret void
68}
69
70; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
Matt Arsenault0de924b2015-11-02 23:15:42 +000071; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
72; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
Matt Arsenault65ad1602015-05-24 00:51:27 +000073; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
; Stores two i32 constants: 123 to element 1 first, then 456 to element 0.
; (0x7b == 123 and 0x1c8 == 456 are the immediates named in the checks above.)
74define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
75 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
76
77 store i32 123, i32 addrspace(1)* %out.gep.1
78 store i32 456, i32 addrspace(1)* %out
79 ret void
80}
81
82; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
83; GCN: buffer_store_dwordx2
; Mixed-type pair on an i32 base pointer: element 1 is written as float 1.0
; through a bitcast pointer, then element 0 is written as i32 456.
84define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
85 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
86 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
87 store float 1.0, float addrspace(1)* %out.gep.1.bc
88 store i32 456, i32 addrspace(1)* %out
89 ret void
90}
91
92; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
Matt Arsenault0de924b2015-11-02 23:15:42 +000093; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
94; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
Matt Arsenaulted891b52015-06-16 15:51:48 +000095; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
; Mixed-type pair on a float base pointer: element 1 is written as i32 123
; through a bitcast pointer, then element 0 is written as float 4.0.
Matt Arsenault65ad1602015-05-24 00:51:27 +000096define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
97 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
98 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
99 store i32 123, i32 addrspace(1)* %out.gep.1.bc
100 store float 4.0, float addrspace(1)* %out
101 ret void
102}
103
104; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
Matt Arsenaulted891b52015-06-16 15:51:48 +0000105; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
106; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
107; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
108; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
109; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
; Stores four i32 constants to elements 1, 2, 3 and finally 0 of %out, so the
; lowest address is written last.
Matt Arsenault65ad1602015-05-24 00:51:27 +0000110define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
111 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
112 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
113 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
114
115 store i32 123, i32 addrspace(1)* %out.gep.1
116 store i32 456, i32 addrspace(1)* %out.gep.2
117 store i32 333, i32 addrspace(1)* %out.gep.3
118 store i32 1234, i32 addrspace(1)* %out
119 ret void
120}
121
122; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
Matt Arsenaultb7748342015-09-21 15:59:46 +0000123; GCN: buffer_store_dwordx4
; Stores four float constants to elements 0, 1, 2, 3 of %out in ascending
; address order.
Matt Arsenault65ad1602015-05-24 00:51:27 +0000124define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
125 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
126 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
127 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
128
129 store float 8.0, float addrspace(1)* %out
130 store float 1.0, float addrspace(1)* %out.gep.1
131 store float 2.0, float addrspace(1)* %out.gep.2
132 store float 4.0, float addrspace(1)* %out.gep.3
133 ret void
134}
135
Matt Arsenaultb7748342015-09-21 15:59:46 +0000136; First store is out of order.
Matt Arsenault65ad1602015-05-24 00:51:27 +0000137; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
Matt Arsenaultb7748342015-09-21 15:59:46 +0000138; GCN: buffer_store_dwordx4
; Same four float constants as merge_global_store_4_constants_f32_order, but
; the store to element 0 is issued last (after elements 1, 2, 3).
Matt Arsenault65ad1602015-05-24 00:51:27 +0000139define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
140 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
141 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
142 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
143
144 store float 1.0, float addrspace(1)* %out.gep.1
145 store float 2.0, float addrspace(1)* %out.gep.2
146 store float 4.0, float addrspace(1)* %out.gep.3
147 store float 8.0, float addrspace(1)* %out
148 ret void
149}
150
Matt Arsenaultb7748342015-09-21 15:59:46 +0000151; FIXME: Should be able to merge this
152; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
Matt Arsenaulte5d95152015-10-13 00:49:00 +0000153; GCN-NOAA: buffer_store_dword v
154; GCN-NOAA: buffer_store_dword v
155; GCN-NOAA: buffer_store_dword v
156; GCN-NOAA: buffer_store_dword v
157
158; GCN-AA: buffer_store_dwordx2
159; GCN-AA: buffer_store_dword v
160; GCN-AA: buffer_store_dword v
161
Matt Arsenaultb7748342015-09-21 15:59:46 +0000162; GCN: s_endpgm
; Four stores to elements 1, 2, 3, 0 of %out with alternating value types:
; elements 1 and 3 are written as i32 through bitcast pointers, elements 2 and
; 0 as float through the original pointer type.
163define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
164 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
165 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
166 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
167
168 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
169 %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
170
171 store i32 11, i32 addrspace(1)* %out.gep.1.bc
172 store float 2.0, float addrspace(1)* %out.gep.2
173 store i32 17, i32 addrspace(1)* %out.gep.3.bc
174 store float 8.0, float addrspace(1)* %out
175 ret void
176}
177
Matt Arsenault65ad1602015-05-24 00:51:27 +0000178; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
179; SI-DAG: buffer_store_dwordx2
180; SI-DAG: buffer_store_dword
181; SI-NOT: buffer_store_dword
182; GCN: s_endpgm
; Stores three i32 constants to elements 1, 2 and then 0 of %out, i.e. an odd
; number of adjacent dwords.
183define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
184 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
185 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
186
187 store i32 123, i32 addrspace(1)* %out.gep.1
188 store i32 456, i32 addrspace(1)* %out.gep.2
189 store i32 1234, i32 addrspace(1)* %out
190 ret void
191}
192
193; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
194; XGCN: buffer_store_dwordx4
195; GCN: buffer_store_dwordx2
196; GCN: buffer_store_dwordx2
; Stores two i64 constants: element 1 first, then element 0 of %out.
; The getelementptr index here is i64 rather than i32.
197define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
198 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
199
200 store i64 123, i64 addrspace(1)* %out.gep.1
201 store i64 456, i64 addrspace(1)* %out
202 ret void
203}
204
205; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
206; XGCN: buffer_store_dwordx4
207; XGCN: buffer_store_dwordx4
208
209; GCN: buffer_store_dwordx2
210; GCN: buffer_store_dwordx2
211; GCN: buffer_store_dwordx2
212; GCN: buffer_store_dwordx2
; Stores four i64 constants to elements 1, 2, 3 and finally 0 of %out.
213define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
214 %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
215 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
216 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
217
218 store i64 123, i64 addrspace(1)* %out.gep.1
219 store i64 456, i64 addrspace(1)* %out.gep.2
220 store i64 333, i64 addrspace(1)* %out.gep.3
221 store i64 1234, i64 addrspace(1)* %out
222 ret void
223}
224
225; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
226; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
227; GCN: buffer_store_dwordx2 [[LOAD]]
; Copies two adjacent i32 values: loads elements 0 and 1 of %in, then stores
; them to elements 0 and 1 of %out in the same order.
228define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
229 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
230 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
231
232 %lo = load i32, i32 addrspace(1)* %in
233 %hi = load i32, i32 addrspace(1)* %in.gep.1
234
235 store i32 %lo, i32 addrspace(1)* %out
236 store i32 %hi, i32 addrspace(1)* %out.gep.1
237 ret void
238}
239
240; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
241; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
242; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; Two-element copy where both the source and the destination start at element
; 2 rather than element 0: loads %in[2], %in[3] and stores to %out[2], %out[3].
243define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
244 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
245 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
246
247 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
248 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
249 %lo = load i32, i32 addrspace(1)* %in.gep.0
250 %hi = load i32, i32 addrspace(1)* %in.gep.1
251
252 store i32 %lo, i32 addrspace(1)* %out.gep.0
253 store i32 %hi, i32 addrspace(1)* %out.gep.1
254 ret void
255}
256
257; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
258; GCN: buffer_load_dword v
259; GCN: buffer_load_dword v
260; GCN: buffer_store_dword v
261; GCN: buffer_store_dword v
; Two-element copy with the values swapped: %in[1] is stored to %out[0] and
; %in[0] is stored to %out[1].
262define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
263 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
264 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
265
266 %lo = load i32, i32 addrspace(1)* %in
267 %hi = load i32, i32 addrspace(1)* %in.gep.1
268
269 store i32 %hi, i32 addrspace(1)* %out
270 store i32 %lo, i32 addrspace(1)* %out.gep.1
271 ret void
272}
273
274; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
275; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
276; GCN: buffer_store_dwordx4 [[LOAD]]
; Copies four adjacent i32 values: loads elements 0..3 of %in, then stores
; them to elements 0..3 of %out in the same ascending order.
277define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
278 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
279 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
280 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
281 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
282 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
283 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
284
285 %x = load i32, i32 addrspace(1)* %in
286 %y = load i32, i32 addrspace(1)* %in.gep.1
287 %z = load i32, i32 addrspace(1)* %in.gep.2
288 %w = load i32, i32 addrspace(1)* %in.gep.3
289
290 store i32 %x, i32 addrspace(1)* %out
291 store i32 %y, i32 addrspace(1)* %out.gep.1
292 store i32 %z, i32 addrspace(1)* %out.gep.2
293 store i32 %w, i32 addrspace(1)* %out.gep.3
294 ret void
295}
296
297; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
298; SI-DAG: buffer_load_dwordx2
299; SI-DAG: buffer_load_dword v
300; GCN: s_waitcnt
301; SI-DAG: buffer_store_dword v
302; SI-DAG: buffer_store_dwordx2 v
303; GCN: s_endpgm
; Copies three adjacent i32 values (elements 0..2 of %in to elements 0..2 of
; %out), i.e. an odd number of dwords.
304define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
305 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
306 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
307 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
308 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
309
310 %x = load i32, i32 addrspace(1)* %in
311 %y = load i32, i32 addrspace(1)* %in.gep.1
312 %z = load i32, i32 addrspace(1)* %in.gep.2
313
314 store i32 %x, i32 addrspace(1)* %out
315 store i32 %y, i32 addrspace(1)* %out.gep.1
316 store i32 %z, i32 addrspace(1)* %out.gep.2
317 ret void
318}
319
320; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
321; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
322; GCN: buffer_store_dwordx4 [[LOAD]]
; Float counterpart of merge_global_store_4_adjacent_loads_i32: loads elements
; 0..3 of %in and stores them to elements 0..3 of %out in the same order.
323define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
324 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
325 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
326 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
327 %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
328 %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
329 %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
330
331 %x = load float, float addrspace(1)* %in
332 %y = load float, float addrspace(1)* %in.gep.1
333 %z = load float, float addrspace(1)* %in.gep.2
334 %w = load float, float addrspace(1)* %in.gep.3
335
336 store float %x, float addrspace(1)* %out
337 store float %y, float addrspace(1)* %out.gep.1
338 store float %z, float addrspace(1)* %out.gep.2
339 store float %w, float addrspace(1)* %out.gep.3
340 ret void
341}
342
343; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
344; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
345; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
; Four-element copy with different non-zero bases on each side: loads elements
; 11..14 of %in and stores them to elements 7..10 of %out, in ascending order.
346define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
347 %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
348 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
349 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
350 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
351 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
352 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
353 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
354 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
355
356 %x = load i32, i32 addrspace(1)* %in.gep.0
357 %y = load i32, i32 addrspace(1)* %in.gep.1
358 %z = load i32, i32 addrspace(1)* %in.gep.2
359 %w = load i32, i32 addrspace(1)* %in.gep.3
360
361 store i32 %x, i32 addrspace(1)* %out.gep.0
362 store i32 %y, i32 addrspace(1)* %out.gep.1
363 store i32 %z, i32 addrspace(1)* %out.gep.2
364 store i32 %w, i32 addrspace(1)* %out.gep.3
365 ret void
366}
367
368; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
369; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
370; GCN: s_barrier
371; GCN: buffer_store_dwordx4 [[LOAD]]
; Four-element copy with a barrier call between the loads and the stores.
; Each value goes back to the index it was loaded from, but the stores are
; issued from the highest element (3) down to element 0.
372define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
373 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
374 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
375 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
376 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
377 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
378 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
379
380 %x = load i32, i32 addrspace(1)* %in
381 %y = load i32, i32 addrspace(1)* %in.gep.1
382 %z = load i32, i32 addrspace(1)* %in.gep.2
383 %w = load i32, i32 addrspace(1)* %in.gep.3
384
385 ; Make sure the barrier doesn't stop this
386 tail call void @llvm.AMDGPU.barrier.local() #1
387
388 store i32 %w, i32 addrspace(1)* %out.gep.3
389 store i32 %z, i32 addrspace(1)* %out.gep.2
390 store i32 %y, i32 addrspace(1)* %out.gep.1
391 store i32 %x, i32 addrspace(1)* %out
392
393 ret void
394}
395
396; TODO: Re-packing of loaded register required. Maybe an IR pass
397; should catch this?
398
399; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
400; GCN: buffer_load_dword v
401; GCN: buffer_load_dword v
402; GCN: buffer_load_dword v
403; GCN: buffer_load_dword v
404; GCN: s_barrier
405; GCN: buffer_store_dword v
406; GCN: buffer_store_dword v
407; GCN: buffer_store_dword v
408; GCN: buffer_store_dword v
; Four-element copy with a barrier call between the loads and the stores, where
; the element order is reversed: %in[3] goes to %out[0], %in[2] to %out[1],
; %in[1] to %out[2] and %in[0] to %out[3].
409define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
410 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
411 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
412 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
413 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
414 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
415 %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
416
417 %x = load i32, i32 addrspace(1)* %in
418 %y = load i32, i32 addrspace(1)* %in.gep.1
419 %z = load i32, i32 addrspace(1)* %in.gep.2
420 %w = load i32, i32 addrspace(1)* %in.gep.3
421
422 ; Make sure the barrier doesn't stop this
423 tail call void @llvm.AMDGPU.barrier.local() #1
424
425 store i32 %w, i32 addrspace(1)* %out
426 store i32 %z, i32 addrspace(1)* %out.gep.1
427 store i32 %y, i32 addrspace(1)* %out.gep.2
428 store i32 %x, i32 addrspace(1)* %out.gep.3
429
430 ret void
431}
432
433; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
434; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
435; GCN: buffer_store_dword [[LOAD]]
436; GCN: s_endpgm
; Copies four adjacent bytes from %in to %out in ascending order. Only the
; first load and the first store carry an explicit alignment (align 4); the
; getelementptr indices are i8-typed.
437define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
438 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
439 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
440 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
441 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
442 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
443 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
444
445 %x = load i8, i8 addrspace(1)* %in, align 4
446 %y = load i8, i8 addrspace(1)* %in.gep.1
447 %z = load i8, i8 addrspace(1)* %in.gep.2
448 %w = load i8, i8 addrspace(1)* %in.gep.3
449
450 store i8 %x, i8 addrspace(1)* %out, align 4
451 store i8 %y, i8 addrspace(1)* %out.gep.1
452 store i8 %z, i8 addrspace(1)* %out.gep.2
453 store i8 %w, i8 addrspace(1)* %out.gep.3
454 ret void
455}
456
457; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
458; GCN: buffer_load_ubyte
459; GCN: buffer_load_ubyte
460; GCN: buffer_load_ubyte
461; GCN: buffer_load_ubyte
462; GCN: buffer_store_byte
463; GCN: buffer_store_byte
464; GCN: buffer_store_byte
465; GCN: buffer_store_byte
466; GCN: s_endpgm
; Same four-byte copy as merge_global_store_4_adjacent_loads_i8, but with no
; explicit alignment on any load or store.
467define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
468 %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
469 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
470 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
471 %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
472 %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
473 %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
474
475 %x = load i8, i8 addrspace(1)* %in
476 %y = load i8, i8 addrspace(1)* %in.gep.1
477 %z = load i8, i8 addrspace(1)* %in.gep.2
478 %w = load i8, i8 addrspace(1)* %in.gep.3
479
480 store i8 %x, i8 addrspace(1)* %out
481 store i8 %y, i8 addrspace(1)* %out.gep.1
482 store i8 %z, i8 addrspace(1)* %out.gep.2
483 store i8 %w, i8 addrspace(1)* %out.gep.3
484 ret void
485}
486
487; This works once AA is enabled on the subtarget
488; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
489; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
Matt Arsenaulte5d95152015-10-13 00:49:00 +0000490
491; GCN-NOAA: buffer_store_dword v
492; GCN-NOAA: buffer_store_dword v
493; GCN-NOAA: buffer_store_dword v
494; GCN-NOAA: buffer_store_dword v
495
496; GCN-AA: buffer_store_dwordx4 [[LOAD]]
497
498; GCN: s_endpgm
; Loads one <4 x i32> vector from %in, extracts lanes 0..3, and stores lane i
; to scalar element i of %out in ascending order.
Matt Arsenault65ad1602015-05-24 00:51:27 +0000499define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
500 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
501 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
502 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
503 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
504
505 %x = extractelement <4 x i32> %vec, i32 0
506 %y = extractelement <4 x i32> %vec, i32 1
507 %z = extractelement <4 x i32> %vec, i32 2
508 %w = extractelement <4 x i32> %vec, i32 3
509
510 store i32 %x, i32 addrspace(1)* %out
511 store i32 %y, i32 addrspace(1)* %out.gep.1
512 store i32 %z, i32 addrspace(1)* %out.gep.2
513 store i32 %w, i32 addrspace(1)* %out.gep.3
514 ret void
515}
516
517; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
518; GCN: ds_write_b8
519; GCN: ds_write_b8
520; GCN: s_endpgm
; addrspace(3) counterpart of merge_global_store_2_constants_i8: stores two i8
; constants to %out + 1 and then %out, the latter marked align 2.
; NOTE(review): 456 does not fit in 8 bits; presumably the IR parser wraps it
; to the low byte -- confirm this is intentional.
521define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
522 %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
523
524 store i8 123, i8 addrspace(3)* %out.gep.1
525 store i8 456, i8 addrspace(3)* %out, align 2
526 ret void
527}
528
529; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
Tom Stellard9a197672015-09-09 15:43:26 +0000530; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
531; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
Matt Arsenault65ad1602015-05-24 00:51:27 +0000532; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
; addrspace(3) counterpart of merge_global_store_2_constants_i32: stores 123 to
; element 1 first, then 456 to element 0.
533define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
534 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
535
536 store i32 123, i32 addrspace(3)* %out.gep.1
537 store i32 456, i32 addrspace(3)* %out
538 ret void
539}
540
541; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
Matt Arsenaultff05da82015-11-24 12:18:54 +0000542; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
543; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
544; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3
545
546; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
547; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
548; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
549
550; GCN: s_endpgm
; addrspace(3) counterpart of merge_global_store_4_constants_i32: stores four
; i32 constants to elements 1, 2, 3 and finally 0 of %out.
Matt Arsenault65ad1602015-05-24 00:51:27 +0000551define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
552 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
553 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
554 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
555
556 store i32 123, i32 addrspace(3)* %out.gep.1
557 store i32 456, i32 addrspace(3)* %out.gep.2
558 store i32 333, i32 addrspace(3)* %out.gep.3
559 store i32 1234, i32 addrspace(3)* %out
560 ret void
561}
562
Matt Arsenaulted891b52015-06-16 15:51:48 +0000563; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
564; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
565; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
566; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
567; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
568; GCN: buffer_store_dword v[[HI]]
; Stores five i32 constants to elements 0..4 of %out in ascending order, each
; store marked align 4. Unlike the tests above, the pointers use inbounds
; getelementptr with i64 indices and the function has no attribute group.
569define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
570 store i32 9, i32 addrspace(1)* %out, align 4
571 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
572 store i32 12, i32 addrspace(1)* %idx1, align 4
573 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
574 store i32 16, i32 addrspace(1)* %idx2, align 4
575 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
576 store i32 -12, i32 addrspace(1)* %idx3, align 4
577 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
578 store i32 11, i32 addrspace(1)* %idx4, align 4
579 ret void
580}
581
582; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
583; GCN: buffer_store_dwordx4
584; GCN: buffer_store_dwordx2
; Stores six i32 constants to elements 0..5 of %out in ascending order, each
; store marked align 4.
585define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
586 store i32 13, i32 addrspace(1)* %out, align 4
587 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
588 store i32 15, i32 addrspace(1)* %idx1, align 4
589 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
590 store i32 62, i32 addrspace(1)* %idx2, align 4
591 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
592 store i32 63, i32 addrspace(1)* %idx3, align 4
593 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
594 store i32 11, i32 addrspace(1)* %idx4, align 4
595 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
596 store i32 123, i32 addrspace(1)* %idx5, align 4
597 ret void
598}
599
600; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
601; GCN: buffer_store_dwordx4
602; GCN: buffer_store_dwordx2
603; GCN: buffer_store_dword v
; Stores seven i32 constants to elements 0..6 of %out in ascending order, each
; store marked align 4.
604define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
605 store i32 34, i32 addrspace(1)* %out, align 4
606 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
607 store i32 999, i32 addrspace(1)* %idx1, align 4
608 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
609 store i32 65, i32 addrspace(1)* %idx2, align 4
610 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
611 store i32 33, i32 addrspace(1)* %idx3, align 4
612 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
613 store i32 98, i32 addrspace(1)* %idx4, align 4
614 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
615 store i32 91, i32 addrspace(1)* %idx5, align 4
616 %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
617 store i32 212, i32 addrspace(1)* %idx6, align 4
618 ret void
619}
620
621; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
Matt Arsenault4d801cd2015-11-24 12:05:03 +0000622; GCN: buffer_store_dwordx4
623; GCN: buffer_store_dwordx4
Matt Arsenaulte5d95152015-10-13 00:49:00 +0000624; GCN: s_endpgm
; Stores eight i32 constants to elements 0..7 of %out in ascending order, each
; store marked align 4. The first seven values match
; merge_global_store_7_constants_i32; element 7 repeats the value 999.
Matt Arsenaulted891b52015-06-16 15:51:48 +0000625define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
626 store i32 34, i32 addrspace(1)* %out, align 4
627 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
628 store i32 999, i32 addrspace(1)* %idx1, align 4
629 %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
630 store i32 65, i32 addrspace(1)* %idx2, align 4
631 %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
632 store i32 33, i32 addrspace(1)* %idx3, align 4
633 %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
634 store i32 98, i32 addrspace(1)* %idx4, align 4
635 %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
636 store i32 91, i32 addrspace(1)* %idx5, align 4
637 %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
638 store i32 212, i32 addrspace(1)* %idx6, align 4
639 %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
640 store i32 999, i32 addrspace(1)* %idx7, align 4
641 ret void
642}
643
; Barrier intrinsic called between the loads and the stores in
; merge_global_store_4_adjacent_loads_inverse_i32 and
; merge_global_store_4_adjacent_loads_shuffle_i32.
Matt Arsenault65ad1602015-05-24 00:51:27 +0000644declare void @llvm.AMDGPU.barrier.local() #1
645
; Attribute group #0 is used by the test functions that reference it; #1 is
; used by the barrier declaration and its two call sites.
646attributes #0 = { nounwind }
647attributes #1 = { noduplicate nounwind }