// Test target codegen - host bc file has to be created first.
2// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
3// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
4// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
5// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
6// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
7// expected-no-diagnostics
8#ifndef HEADER
9#define HEADER
10
11// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
12// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
13
14// Check that the execution mode of all 3 target regions is set to Generic Mode.
15// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1
16// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1
17// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1
18
template<typename tx>
tx ftemplate(int n) {
  int a;    // reduced with '|'   in the third target region
  short b;  // reduced with 'max' in the third target region
  tx c;     // reduced with '^'   in the second target region
  float d;  // reduced with '*'   in the second target region
  double e; // reduced with '+'   in the first target region

  #pragma omp target
  #pragma omp teams reduction(+: e)
  {
    e += 5;
  }

  #pragma omp target
  #pragma omp teams reduction(^: c) reduction(*: d)
  {
    c ^= 2;
    d *= 33;
  }

  #pragma omp target
  #pragma omp teams reduction(|: a) reduction(max: b)
  {
    a |= 1;
    b = 99 > b ? 99 : b;
  }
  // NOTE(review): the offloading-region symbols matched by the CHECK lines
  // embed the source line of each '#pragma omp target' directive above
  // (l27, l33, l40); adding or removing lines before the last directive
  // breaks those matches, so keep the directives on their original lines.

  return a+b+c+d+e;
}
49
50int bar(int n){
51 int a = 0;
52
53 a += ftemplate<char>(n);
54
55 return a;
56}
57
58 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker()
59
60 // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]](
61 //
62 // CHECK: {{call|invoke}} void [[T1]]_worker()
63 //
64 // CHECK: call void @__kmpc_kernel_init(
65 //
66 // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
67 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
68 // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
69 // CHECK: store double [[ADD]], double* [[E]], align
70 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i[[SZ:32|64]] 0, i{{32|64}} 0
71 // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
72 // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
73 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
74 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 1, i[[SZ]] {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
75 // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
76 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
77 //
78 // CHECK: [[IFLABEL]]
79 // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
80 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
81 // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
82 // CHECK: store double [[ADD]], double* [[E_IN]], align
83 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
84 // CHECK: br label %[[EXIT]]
85 //
86 // CHECK: [[EXIT]]
87 // CHECK: call void @__kmpc_kernel_deinit()
88
89 //
90 // Reduction function
91 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
92 // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
93 // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
94 // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
95 //
96 // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
97 // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
98 // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
99 //
100 // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
101 // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
102 // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
103 // CHECK: store double [[RES]], double* [[VAR_LHS]],
104 // CHECK: ret void
105
106 //
107 // Shuffle and reduce function
108 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
109 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
110 // CHECK: [[REMOTE_ELT:%.+]] = alloca double
111 //
112 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
113 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
114 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
115 //
116 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
117 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
118 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
119 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
120 // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
121 //
122 // CHECK: [[ELT_CAST:%.+]] = bitcast double [[ELT_VAL]] to i64
123 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
124 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
125 // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
126 // CHECK: [[REMOTE_ELT_VAL:%.+]] = bitcast i64 [[REMOTE_ELT_VAL64]] to double
127 //
128 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align
129 // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
130 // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
131 //
132 // Condition to reduce
133 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
134 //
135 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
136 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
137 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
138 //
139 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
140 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
141 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
142 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
143 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
144 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
145 //
146 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
147 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
148 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
149 //
150 // CHECK: [[DO_REDUCE]]
151 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
152 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
153 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
154 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
155 //
156 // CHECK: [[REDUCE_ELSE]]
157 // CHECK: br label {{%?}}[[REDUCE_CONT]]
158 //
159 // CHECK: [[REDUCE_CONT]]
160 // Now check if we should just copy over the remote reduction list
161 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
162 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
163 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
164 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
165 //
166 // CHECK: [[DO_COPY]]
167 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
168 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
169 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
170 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
171 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
172 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
173 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
174 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
175 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
176 //
177 // CHECK: [[COPY_ELSE]]
178 // CHECK: br label {{%?}}[[COPY_CONT]]
179 //
180 // CHECK: [[COPY_CONT]]
181 // CHECK: void
182
183 //
184 // Inter warp copy function
185 // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
186 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
187 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
188 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
189 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
190 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
191 //
192 // [[DO_COPY]]
193 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
194 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
195 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
196 // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
197 //
198 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
199 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
200 // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
201 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
202 //
203 // CHECK: [[COPY_ELSE]]
204 // CHECK: br label {{%?}}[[COPY_CONT]]
205 //
206 // Barrier after copy to shared memory storage medium.
207 // CHECK: [[COPY_CONT]]
208 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
209 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
210 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
211 //
212 // Read into warp 0.
213 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
214 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
215 //
216 // CHECK: [[DO_READ]]
217 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
218 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
219 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
220 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
221 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
222 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
223 // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
224 // CHECK: br label {{%?}}[[READ_CONT:.+]]
225 //
226 // CHECK: [[READ_ELSE]]
227 // CHECK: br label {{%?}}[[READ_CONT]]
228 //
229 // CHECK: [[READ_CONT]]
230 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
231 // CHECK: ret
232
233 //
234 // Copy to scratchpad function
235 // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32)
236 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
237 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
238 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
239 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
240 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
241 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
242 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
243 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
244 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
245 //
246 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
247 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
248 //
249 // CHECK: [[P:%.+]] = mul i[[SZ]] 8, [[TEAM]]
250 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
251 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
252 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double*
253 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
254 // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
255 // CHECK: store double [[ELT_VAL]], double* [[SCRATCHPAD_ELT_PTR]], align
256 //
257 // CHECK: ret
258
259 //
260 // Load and reduce function
261 // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32)
262 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
263 // CHECK: [[REMOTE_ELT:%.+]] = alloca double
264 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
265 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
266 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
267 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
268 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
269 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
270 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
271 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
272 // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
273 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
274 //
275 // CHECK: [[P:%.+]] = mul i[[SZ]] 8, [[TEAM]]
276 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
277 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
278
279 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
280 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double*
281 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[SCRATCHPAD_ELT_PTR]], align
282 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align
283 // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
284 // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align
285 //
286 // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1
287 // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
288 //
289 // CHECK: [[DO_REDUCE]]
290 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
291 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
292 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
293 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
294 //
295 // Copy element from remote reduce list
296 // CHECK: [[REDUCE_ELSE]]
297 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
298 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
299 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
300 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
301 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
302 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
303 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
304 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
305 // CHECK: br label {{%?}}[[REDUCE_CONT]]
306 //
307 // CHECK: [[REDUCE_CONT]]
308 // CHECK: ret
309
310
311
312
313
314
315
316
317
318
319
320 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}_worker()
321
322 // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l33]](
323 //
324 // CHECK: {{call|invoke}} void [[T2]]_worker()
325 //
326 // CHECK: call void @__kmpc_kernel_init(
327 //
328 // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
329 // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
330 // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
331 // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
332 // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
333 // CHECK: store i8 [[TRUNC]], i8* [[C]], align
334 // CHECK: [[DV:%.+]] = load float, float* [[D]], align
335 // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
336 // CHECK: store float [[MUL]], float* [[D]], align
337 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
338 // CHECK: store i8* [[C]], i8** [[PTR1]], align
339 // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
340 // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
341 // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
342 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
343 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
344 // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
345 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
346 //
347 // CHECK: [[IFLABEL]]
348 // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
349 // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
350 // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
351 // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
352 // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
353 // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
354 // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
355 // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
356 // CHECK: [[DV:%.+]] = load float, float* [[D]], align
357 // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
358 // CHECK: store float [[MUL]], float* [[D_IN]], align
359 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
360 // CHECK: br label %[[EXIT]]
361 //
362 // CHECK: [[EXIT]]
363 // CHECK: call void @__kmpc_kernel_deinit()
364
365 //
366 // Reduction function
367 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
368 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
369 // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
370 //
371 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
372 // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
373 //
374 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
375 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
376 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
377 //
378 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
379 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
380 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
381 //
382 // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
383 // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
384 // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
385 // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
386 // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
387 // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
388 // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
389 //
390 // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
391 // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
392 // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
393 // CHECK: store float [[RES]], float* [[VAR2_LHS]],
394 // CHECK: ret void
395
396 //
397 // Shuffle and reduce function
398 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
399 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
400 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
401 // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
402 //
403 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
404 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
405 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
406 //
407 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
408 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
409 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
410 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
411 //
412 // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
413 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
414 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
415 // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
416 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
417 //
418 // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
419 // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
420 //
421 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
422 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
423 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
424 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
425 // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
426 //
427 // CHECK: [[ELT_CAST:%.+]] = bitcast float [[ELT_VAL]] to i32
428 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
429 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
430 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
431 // CHECK: [[REMOTE_ELT2_VAL:%.+]] = bitcast i32 [[REMOTE_ELT2_VAL32]] to float
432 //
433 // CHECK: store float [[REMOTE_ELT2_VAL]], float* [[REMOTE_ELT2]], align
434 // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
435 // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
436 //
437 // Condition to reduce
438 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
439 //
440 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
441 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
442 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
443 //
444 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
445 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
446 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
447 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
448 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
449 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
450 //
451 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
452 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
453 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
454 //
455 // CHECK: [[DO_REDUCE]]
456 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
457 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
458 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
459 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
460 //
461 // CHECK: [[REDUCE_ELSE]]
462 // CHECK: br label {{%?}}[[REDUCE_CONT]]
463 //
464 // CHECK: [[REDUCE_CONT]]
465 // Now check if we should just copy over the remote reduction list
466 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
467 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
468 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
469 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
470 //
471 // CHECK: [[DO_COPY]]
472 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
473 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
474 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
475 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
476 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
477 // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
478 //
479 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
480 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
481 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
482 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
483 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
484 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
485 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
486 // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
487 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
488 //
489 // CHECK: [[COPY_ELSE]]
490 // CHECK: br label {{%?}}[[COPY_CONT]]
491 //
492 // CHECK: [[COPY_CONT]]
493 // CHECK: void
494
495 //
496 // Inter warp copy function
497 // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
498 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
499 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
500 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
501 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
502 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
503 //
504 // [[DO_COPY]]
505 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
506 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
507 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
508 //
509 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
510 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
511 // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
512 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
513 //
514 // CHECK: [[COPY_ELSE]]
515 // CHECK: br label {{%?}}[[COPY_CONT]]
516 //
517 // Barrier after copy to shared memory storage medium.
518 // CHECK: [[COPY_CONT]]
519 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
520 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
521 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
522 //
523 // Read into warp 0.
524 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
525 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
526 //
527 // CHECK: [[DO_READ]]
528 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
529 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
530 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
531 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
532 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
533 // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
534 // CHECK: br label {{%?}}[[READ_CONT:.+]]
535 //
536 // CHECK: [[READ_ELSE]]
537 // CHECK: br label {{%?}}[[READ_CONT]]
538 //
539 // CHECK: [[READ_CONT]]
540 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
541 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
542 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
543 //
544 // [[DO_COPY]]
545 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
546 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
547 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
548 // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
549 //
550 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
551 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
552 // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
553 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
554 //
555 // CHECK: [[COPY_ELSE]]
556 // CHECK: br label {{%?}}[[COPY_CONT]]
557 //
558 // Barrier after copy to shared memory storage medium.
559 // CHECK: [[COPY_CONT]]
560 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
561 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
562 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
563 //
564 // Read into warp 0.
565 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
566 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
567 //
568 // CHECK: [[DO_READ]]
569 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
570 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
571 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
572 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
573 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
574 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
575 // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
576 // CHECK: br label {{%?}}[[READ_CONT:.+]]
577 //
578 // CHECK: [[READ_ELSE]]
579 // CHECK: br label {{%?}}[[READ_CONT]]
580 //
581 // CHECK: [[READ_CONT]]
582 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
583 // CHECK: ret
584
585 //
586 // Copy to scratchpad function
587 // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32)
588 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
589 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
590 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
591 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
592 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
593 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
594 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
595 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
596 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
597 //
598 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
599 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
600 //
601 // CHECK: [[P:%.+]] = mul i[[SZ]] 1, [[TEAM]]
602 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
603 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
604 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
605 // CHECK: store i8 [[ELT_VAL]], i8* [[SCRATCHPAD_ELT_PTR]], align
606 //
607 // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 1
608 // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]]
609 // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1
610 // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256
611 // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1
612 // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256
613 //
614 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
615 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
616 //
617 // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
618 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
619 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
620 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float*
621 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
622 // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
623 // CHECK: store float [[ELT_VAL]], float* [[SCRATCHPAD_ELT_PTR]], align
624 //
625 // CHECK: ret
626
627 //
628 // Load and reduce function
629 // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32)
630 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
631 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
632 // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
633 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
634 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
635 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
636 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
637 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
638 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
639 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
640 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
641 // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
642 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
643 //
644 // CHECK: [[P:%.+]] = mul i[[SZ]] 1, [[TEAM]]
645 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
646 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
647
648 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
649 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[SCRATCHPAD_ELT_PTR_VOID]], align
650 // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[REMOTE_ELT1]], align
651 // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
652 //
653 // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 1
654 // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]]
655 // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1
656 // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256
657 // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1
658 // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256
659 //
660 // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
661 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
662 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
663
664 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
665 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float*
666 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[SCRATCHPAD_ELT_PTR]], align
667 // CHECK: store float [[REMOTE_ELT_VAL]], float* [[REMOTE_ELT2]], align
668 // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
669 // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align
670 //
671 // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1
672 // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
673 //
674 // CHECK: [[DO_REDUCE]]
675 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
676 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
677 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
678 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
679 //
680 // Copy element from remote reduce list
681 // CHECK: [[REDUCE_ELSE]]
682 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
683 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
684 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
685 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
686 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
687 // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
688 //
689 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
690 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
691 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
692 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
693 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
694 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
695 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
696 // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
697 // CHECK: br label {{%?}}[[REDUCE_CONT]]
698 //
699 // CHECK: [[REDUCE_CONT]]
700 // CHECK: ret
701
702
703
704
705
706
707
708
709
710
711 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker()
712
713 // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+template.+l40]](
714 //
715 // CHECK: {{call|invoke}} void [[T3]]_worker()
716 //
717 // CHECK: call void @__kmpc_kernel_init(
718 //
719 // CHECK: store i32 0, i32* [[A:%.+]], align
720 // CHECK: store i16 -32768, i16* [[B:%.+]], align
721 // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
722 // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
723 // CHECK: store i32 [[OR]], i32* [[A]], align
724 // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
725 // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
726 // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
727 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
728 //
729 // CHECK: [[DO_MAX]]
730 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
731 //
732 // CHECK: [[MAX_ELSE]]
733 // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
734 // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
735 // CHECK: br label {{%?}}[[MAX_CONT]]
736 //
737 // CHECK: [[MAX_CONT]]
738 // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
739 // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
740 // CHECK: store i16 [[TRUNC]], i16* [[B]], align
741 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
742 // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
743 // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
744 // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
745 // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
746 // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
747 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
748 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
749 // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
750 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
751 //
752 // CHECK: [[IFLABEL]]
753 // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
754 // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
755 // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
756 // CHECK: store i32 [[OR]], i32* [[A_IN]], align
757 // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
758 // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
759 // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
760 // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
761 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
762 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
763 //
764 // CHECK: [[DO_MAX]]
765 // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
766 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
767 //
768 // CHECK: [[MAX_ELSE]]
769 // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
770 // CHECK: br label {{%?}}[[MAX_CONT]]
771 //
772 // CHECK: [[MAX_CONT]]
773 // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
774 // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
775 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
776 // CHECK: br label %[[EXIT]]
777 //
778 // CHECK: [[EXIT]]
779 // CHECK: call void @__kmpc_kernel_deinit()
780
781 //
782 // Reduction function
783 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
784 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
785 // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
786 // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
787 //
788 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
789 // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
790 // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
791 //
792 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
793 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
794 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
795 //
796 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
797 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
798 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
799 //
800 // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
801 // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
802 // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
803 // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
804 //
805 // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
806 // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
807 // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
808 // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
809 //
810 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
811 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
812 //
813 // CHECK: [[DO_MAX]]
814 // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
815 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
816 //
817 // CHECK: [[MAX_ELSE]]
818 // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
819 // CHECK: br label {{%?}}[[MAX_CONT]]
820 //
821 // CHECK: [[MAX_CONT]]
822 // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
823 // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
824 // CHECK: ret void
825
826 //
827 // Shuffle and reduce function
828 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
829 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
830 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
831 // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
832 //
833 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
834 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
835 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
836 //
837 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
838 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
839 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
840 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
841 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
842 //
843 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
844 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
845 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
846 //
847 // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
848 // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
849 // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
850 //
851 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
852 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
853 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
854 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
855 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
856 //
857 // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
858 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
859 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
860 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
861 // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
862 //
863 // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
864 // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
865 // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
866 //
867 // Condition to reduce
868 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
869 //
870 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
871 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
872 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
873 //
874 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
875 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
876 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
877 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
878 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
879 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
880 //
881 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
882 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
883 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
884 //
885 // CHECK: [[DO_REDUCE]]
886 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
887 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
888 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
889 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
890 //
891 // CHECK: [[REDUCE_ELSE]]
892 // CHECK: br label {{%?}}[[REDUCE_CONT]]
893 //
894 // CHECK: [[REDUCE_CONT]]
895 // Now check if we should just copy over the remote reduction list
896 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
897 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
898 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
899 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
900 //
901 // CHECK: [[DO_COPY]]
902 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
903 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
904 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
905 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
906 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
907 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
908 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
909 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
910 //
911 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
912 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
913 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
914 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
915 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
916 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
917 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
918 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
919 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
920 //
921 // CHECK: [[COPY_ELSE]]
922 // CHECK: br label {{%?}}[[COPY_CONT]]
923 //
924 // CHECK: [[COPY_CONT]]
925 // CHECK: void
926
927 //
928 // Inter warp copy function
929 // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
930 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
931 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
932 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
933 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
934 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
935 //
 936 // [[DO_COPY]] — informational label marker only; deliberately has no CHECK: prefix, so FileCheck ignores this line.
937 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
938 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
939 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
940 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
941 //
942 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
943 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
944 // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
945 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
946 //
947 // CHECK: [[COPY_ELSE]]
948 // CHECK: br label {{%?}}[[COPY_CONT]]
949 //
950 // Barrier after copy to shared memory storage medium.
951 // CHECK: [[COPY_CONT]]
952 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
953 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
954 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
955 //
956 // Read into warp 0.
957 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
958 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
959 //
960 // CHECK: [[DO_READ]]
961 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
962 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
963 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
964 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
965 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
966 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
967 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
968 // CHECK: br label {{%?}}[[READ_CONT:.+]]
969 //
970 // CHECK: [[READ_ELSE]]
971 // CHECK: br label {{%?}}[[READ_CONT]]
972 //
973 // CHECK: [[READ_CONT]]
974 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
975 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
976 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
977 //
 978 // [[DO_COPY]] — informational label marker only; deliberately has no CHECK: prefix, so FileCheck ignores this line.
979 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
980 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
981 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
982 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
983 //
984 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
985 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
986 // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
987 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
988 //
989 // CHECK: [[COPY_ELSE]]
990 // CHECK: br label {{%?}}[[COPY_CONT]]
991 //
992 // Barrier after copy to shared memory storage medium.
993 // CHECK: [[COPY_CONT]]
994 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
995 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
996 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
997 //
998 // Read into warp 0.
999 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
1000 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
1001 //
1002 // CHECK: [[DO_READ]]
1003 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
1004 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
1005 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1006 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
1007 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1008 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1009 // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
1010 // CHECK: br label {{%?}}[[READ_CONT:.+]]
1011 //
1012 // CHECK: [[READ_ELSE]]
1013 // CHECK: br label {{%?}}[[READ_CONT]]
1014 //
1015 // CHECK: [[READ_CONT]]
1016 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
1017 // CHECK: ret
1018
1019 //
1020 // Copy to scratchpad function
1021 // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32)
1022 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
1023 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
1024 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
1025 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
1026 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
1027 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
1028 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
1029 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
1030 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
1031 //
1032 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
1033 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1034 //
1035 // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
1036 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
1037 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
1038 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
1039 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
1040 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
1041 // CHECK: store i32 [[ELT_VAL]], i32* [[SCRATCHPAD_ELT_PTR]], align
1042 //
1043 // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 4
1044 // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]]
1045 // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1
1046 // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256
1047 // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1
1048 // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256
1049 //
1050 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1051 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1052 //
1053 // CHECK: [[P:%.+]] = mul i[[SZ]] 2, [[TEAM]]
1054 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
1055 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
1056 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16*
1057 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1058 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
1059 // CHECK: store i16 [[ELT_VAL]], i16* [[SCRATCHPAD_ELT_PTR]], align
1060 //
1061 // CHECK: ret
1062
1063 //
1064 // Load and reduce function
1065 // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32)
1066 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
1067 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
1068 // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
1069 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
1070 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
1071 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
1072 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
1073 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
1074 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
1075 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
1076 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
1077 // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
1078 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
1079 //
1080 // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
1081 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
1082 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
1083
1084 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
1085 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
1086 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[SCRATCHPAD_ELT_PTR]], align
1087 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[REMOTE_ELT1]], align
1088 // CHECK: [[REMOTE_ELT1_PTR:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
1089 // CHECK: store i8* [[REMOTE_ELT1_PTR]], i8** [[REMOTE_ELT_REF]], align
1090 //
1091 // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 4
1092 // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]]
1093 // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1
1094 // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256
1095 // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1
1096 // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256
1097 //
1098 // CHECK: [[P:%.+]] = mul i[[SZ]] 2, [[TEAM]]
1099 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
1100 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
1101
1102 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1103 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16*
1104 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[SCRATCHPAD_ELT_PTR]], align
1105 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[REMOTE_ELT2]], align
1106 // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
1107 // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align
1108 //
1109 // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1
1110 // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
1111 //
1112 // CHECK: [[DO_REDUCE]]
1113 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
1114 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
1115 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
1116 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
1117 //
1118 // Copy element from remote reduce list
1119 // CHECK: [[REDUCE_ELSE]]
1120 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1121 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
1122 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1123 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1124 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
1125 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
1126 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
1127 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
1128 //
1129 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
1130 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
1131 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
1132 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1133 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1134 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
1135 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
1136 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
1137 // CHECK: br label {{%?}}[[REDUCE_CONT]]
1138 //
1139 // CHECK: [[REDUCE_CONT]]
1140 // CHECK: ret
1141
1142
1143#endif