; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s

declare i64 @_Z13get_global_idj(i32)

; Eight i64 loads from one base pointer at constant 2 KiB strides. GFX9 is
; expected to fold the offsets into the signed immediate of
; global_load_dwordx2; GFX8 flat loads carry no immediate offset field.
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}
63
; Loop version: eleven loads per iteration at constant strides from a
; loop-varying base. Checks the offsets still fold into GFX9 global_load
; immediates inside the loop; GFX8 flat loads have no offset field.
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}
185
; Same pattern with 32-bit (dword) accesses: ten i32 loads at 1 KiB strides.
; GFX9 should fold the strides into global_load_dword immediate offsets.
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Address32:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %id = shl i64 %call, 7
  %idx.ext11 = and i64 %id, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

  %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
  %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
  %add.1 = add i32 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
  %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
  %add.2 = add i32 %load3, %add.1

  %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
  %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
  %add.3 = add i32 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
  %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
  %add.4 = add i32 %load5, %add.3

  %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
  %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
  %add.5 = add i32 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
  %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
  %add.6 = add i32 %load7, %add.5

  %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
  %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
  %add.7 = add i32 %load8, %add.6

  %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
  %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
  %add.8 = add i32 %load9, %add.7

  %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
  %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
  %add.9 = add i32 %load10, %add.8

  store i32 %add.9, i32 addrspace(1)* %addr, align 4
  ret void
}
260
; Very large (multi-GiB) element offsets: only the low bits can go into the
; GFX9 immediate; the rest stays in the computed 64-bit address.
define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Offset64:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}
300
; TODO: Support load4 as anchor instruction.
; i32 variant of the huge-offset case; only one load currently gets an
; immediate offset on GFX9.
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}
341
; Two independent base pointers: offsets are folded per-base on GFX9, so each
; chain of loads gets its own anchor.
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, i8 addrspace(1)* %buffer2) {
; GCN-LABEL: DiffBase:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}
394
; Same chain as clmem_read_simplified but loads issued in descending offset
; order; offset folding on GFX9 should still apply.
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}
456
; Negative element offsets from the wave base; GFX9 folds the 2 KiB delta
; between the two loads into one immediate offset.
; NOTE(review): the original line carried a call-site attribute group `#2`
; that is never defined in this file (a parse error); it has been dropped.
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}