; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s

; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s

; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; Two constant i8 stores to adjacent global addresses; the store to the base
; pointer is marked align 2. The checks expect two separate byte stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  ; 456 does not fit in i8; 200 (0xc8) is its low byte, i.e. the value an
  ; 8-bit store of 456 would write.
  store i8 200, i8 addrspace(1)* %out, align 2
  ret void
}

; Same as merge_global_store_2_constants_i8 but with no explicit alignment on
; either store. The checks expect two separate byte stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  ; 456 does not fit in i8; 200 (0xc8) is its low byte, i.e. the value an
  ; 8-bit store of 456 would write.
  store i8 200, i8 addrspace(1)* %out
  ret void
}

; Two constant i16 stores to adjacent global addresses; the store to the base
; pointer is marked align 4. The check expects a single dword store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}

; Variant of merge_global_store_2_constants_i16 where both stored constants
; are zero. The check expects a single dword store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %out.gep.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}

; Two constant i16 stores to adjacent global addresses with no explicit
; alignment. The checks expect two separate short stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}

; Two constant i32 stores to adjacent global addresses, issued high element
; first. The checks expect one dwordx2 store whose low register holds
; 0x1c8 (456) and whose high register holds 0x7b (123).
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed-type pair: a float constant is stored to element 1 (through a bitcast
; of the i32 pointer) and an i32 constant to element 0. The check expects a
; single dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %out.gep.1.bc
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed-type pair: an i32 constant is stored to element 1 (through a bitcast
; of the float pointer) and a float constant to element 0. The checks expect
; one dwordx2 store whose low register holds 4.0 and whose high register
; holds 0x7b (123).
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0
; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}}
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %out.gep.1.bc
  store float 4.0, float addrspace(1)* %out
  ret void
}

; Four constant i32 stores to consecutive global addresses, with the store to
; element 0 issued last. The checks expect one dwordx4 store whose first
; register holds 0x4d2 (1234) and whose last register holds 0x14d (333).
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Four constant float stores to consecutive global addresses, issued in
; ascending address order. The check expects a single dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  ret void
}

; First store is out of order.
; Same data as merge_global_store_4_constants_f32_order, but the store to
; element 0 is issued after the other three. The check still expects a single
; dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  store float 8.0, float addrspace(1)* %out
  ret void
}

; FIXME: Should be able to merge this
; Four consecutive global elements written with alternating i32 and float
; constants (elements 1 and 3 go through i32 bitcasts). Without
; -combiner-alias-analysis the checks expect four dword stores; with it they
; expect one dwordx2 store followed by two dword stores.
; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v

; GCN-AA: buffer_store_dwordx2
; GCN-AA: buffer_store_dword v
; GCN-AA: buffer_store_dword v

; GCN: s_endpgm
define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*

  store i32 11, i32 addrspace(1)* %out.gep.1.bc
  store float 2.0, float addrspace(1)* %out.gep.2
  store i32 17, i32 addrspace(1)* %out.gep.3.bc
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Three constant i32 stores to consecutive global addresses. The checks
; expect one dwordx2 store plus one dword store (in either order) and no
; further dword store before the end of the program.
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Two constant i64 stores to adjacent global addresses. The active checks
; expect two dwordx2 stores. The XGCN line uses a prefix that no RUN line
; enables, so it is inactive (presumably it records the desired merged form).
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; XGCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}

; Four constant i64 stores to consecutive global addresses. The active checks
; expect four dwordx2 stores. The XGCN lines use a prefix that no RUN line
; enables, so they are inactive (presumably they record the desired merged
; form).
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; XGCN: buffer_store_dwordx4
; XGCN: buffer_store_dwordx4

; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out.gep.2
  store i64 333, i64 addrspace(1)* %out.gep.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}

; Two adjacent i32 values are loaded from %in and stored, in the same order,
; to two adjacent elements of %out. The checks expect one dwordx2 load whose
; result register pair is written back by one dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; Like merge_global_store_2_adjacent_loads_i32, but both the loads and the
; stores start at element 2 rather than element 0. The checks expect the
; merged dwordx2 load and store to each carry offset:8.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %lo = load i32, i32 addrspace(1)* %in.gep.0
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out.gep.0
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; Two adjacent i32 values are loaded and then stored swapped (the high
; element goes to %out[0], the low element to %out[1]). The checks expect two
; separate dword loads and two separate dword stores.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %hi, i32 addrspace(1)* %out
  store i32 %lo, i32 addrspace(1)* %out.gep.1
  ret void
}

; Four consecutive i32 values are loaded from %in and stored, in the same
; order, to four consecutive elements of %out. The checks expect one dwordx4
; load whose result registers are written back by one dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Three consecutive i32 values are loaded from %in and stored in the same
; order to %out. The checks expect a dwordx2 load plus a dword load, a wait,
; then a dword store plus a dwordx2 store (each pair in either order).
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword v
; GCN: s_waitcnt
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  ret void
}

; Float version of merge_global_store_4_adjacent_loads_i32: four consecutive
; floats are loaded and stored in the same order. The checks expect one
; dwordx4 load whose result registers are written back by one dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %x = load float, float addrspace(1)* %in
  %y = load float, float addrspace(1)* %in.gep.1
  %z = load float, float addrspace(1)* %in.gep.2
  %w = load float, float addrspace(1)* %in.gep.3

  store float %x, float addrspace(1)* %out
  store float %y, float addrspace(1)* %out.gep.1
  store float %z, float addrspace(1)* %out.gep.2
  store float %w, float addrspace(1)* %out.gep.3
  ret void
}

; Four consecutive i32 values are loaded from %in elements 11..14 and stored
; to %out elements 7..10. The checks expect one dwordx4 load at offset:44 and
; one dwordx4 store of the same registers at offset:28.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %x = load i32, i32 addrspace(1)* %in.gep.0
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out.gep.0
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Four consecutive i32 values are loaded, a local barrier is executed, and
; the values are stored to their matching %out elements with the store
; instructions issued in descending address order. The checks expect one
; dwordx4 load, the barrier, then one dwordx4 store of the loaded registers.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.AMDGPU.barrier.local() #1

  store i32 %w, i32 addrspace(1)* %out.gep.3
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %x, i32 addrspace(1)* %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; Four consecutive i32 values are loaded, a local barrier is executed, and
; the values are stored in reversed element order (%w to %out[0] ... %x to
; %out[3]). The checks expect four dword loads, the barrier, then four dword
; stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: s_barrier
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.AMDGPU.barrier.local() #1

  store i32 %w, i32 addrspace(1)* %out
  store i32 %z, i32 addrspace(1)* %out.gep.1
  store i32 %y, i32 addrspace(1)* %out.gep.2
  store i32 %x, i32 addrspace(1)* %out.gep.3

  ret void
}

; Four consecutive i8 values are loaded and stored in the same order; the
; first load and the first store are marked align 4. The checks expect one
; dword load whose result register is written back by one dword store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in, align 4
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out, align 4
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; Same as merge_global_store_4_adjacent_loads_i8 but with no explicit
; alignment on any access. The checks expect four byte loads followed by four
; byte stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; This works once AA is enabled on the subtarget
; A <4 x i32> is loaded, its four elements are extracted and stored to four
; consecutive i32 elements of %out. The checks expect one dwordx4 load; then
; four dword stores without -combiner-alias-analysis, or one dwordx4 store of
; the loaded registers with it.
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]

; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v

; GCN-AA: buffer_store_dwordx4 [[LOAD]]

; GCN: s_endpgm
define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Local (addrspace 3) version of merge_global_store_2_constants_i8: two
; constant i8 stores to adjacent addresses, the base store marked align 2.
; The checks expect two separate ds_write_b8 instructions.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  ; 456 does not fit in i8; 200 (0xc8) is its low byte, i.e. the value an
  ; 8-bit store of 456 would write.
  store i8 200, i8 addrspace(3)* %out, align 2
  ret void
}

; Two constant i32 stores to adjacent local (addrspace 3) addresses. The
; checks expect one ds_write2_b32 with offset1:1 whose first data register
; holds 0x1c8 (456) and whose second holds 0x7b (123).
; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}

; Four constant i32 stores to consecutive local (addrspace 3) addresses. The
; checks expect four separate ds_write_b32 instructions.
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: ds_write_b32
define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}

; Five constant i32 stores to consecutive global addresses, in ascending
; order. The checks expect one dwordx4 store covering the registers holding
; 9 through -12, followed by a single dword store of 11.
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}

; Six constant i32 stores to consecutive global addresses, in ascending
; order. The checks expect one dwordx4 store followed by one dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}

; Seven constant i32 stores to consecutive global addresses, in ascending
; order. The checks expect a dwordx4 store, then a dwordx2 store, then a
; single dword store.
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dword v
define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}

; FIXME: This should do 2 dwordx4 stores
; Eight constant i32 stores to consecutive global addresses, in ascending
; order. Without -combiner-alias-analysis the checks expect eight dword
; stores; with it they expect one dwordx4 store and two dwordx2 stores.
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:

; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v

; GCN-AA: buffer_store_dwordx4
; GCN-AA: buffer_store_dwordx2
; GCN-AA: buffer_store_dwordx2

; GCN: s_endpgm
define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}

; Barrier intrinsic called by the *_inverse_i32 and 4-element *_shuffle_i32
; tests between their loads and stores.
declare void @llvm.AMDGPU.barrier.local() #1

; #0 is attached to most test functions; #1 to the barrier declaration and
; its call sites.
attributes #0 = { nounwind }
attributes #1 = { noduplicate nounwind }