blob: e2a103ab86de2ee6fc1c382ace71ce71b8e9f713 [file] [log] [blame]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001// Test target codegen - host bc file has to be created first.
2// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
3// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
4// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
5// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
6// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
7// expected-no-diagnostics
8#ifndef HEADER
9#define HEADER
10
Alexey Bataevf2f39be2018-11-16 19:38:21 +000011// CHECK: [[MAP_TY:%.+]] = type { [16 x i8] }
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000012
Alexey Bataevf2f39be2018-11-16 19:38:21 +000013// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
14// CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
15// CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
16// CHECK-DAG: [[KERNEL_SHARED3:@.+]] = internal unnamed_addr constant i16 1
17// CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}}
18// CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16
19// CHECK-DAG: [[KERNEL_SIZE3:@.+]] = internal unnamed_addr constant i{{64|32}} 8
20
21// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
22// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
23
24// Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD.
25// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 1
26// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 1
27// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000028
29template<typename tx>
30tx ftemplate(int n) {
31 int a;
32 short b;
33 tx c;
34 float d;
35 double e;
36
37 #pragma omp target
38 #pragma omp teams reduction(+: e)
39 {
40 e += 5;
41 }
42
43 #pragma omp target
44 #pragma omp teams reduction(^: c) reduction(*: d)
45 {
46 c ^= 2;
47 d *= 33;
48 }
49
50 #pragma omp target
51 #pragma omp teams reduction(|: a) reduction(max: b)
Alexey Bataevf2f39be2018-11-16 19:38:21 +000052 #pragma omp parallel reduction(|: a) reduction(max: b)
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000053 {
54 a |= 1;
55 b = 99 > b ? 99 : b;
56 }
57
58 return a+b+c+d+e;
59}
60
61int bar(int n){
62 int a = 0;
63
64 a += ftemplate<char>(n);
65
66 return a;
67}
68
Alexey Bataevf2f39be2018-11-16 19:38:21 +000069 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l37}}_worker()
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000070
Alexey Bataevf2f39be2018-11-16 19:38:21 +000071 // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l37]](
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000072 //
73 // CHECK: {{call|invoke}} void [[T1]]_worker()
74 //
75 // CHECK: call void @__kmpc_kernel_init(
76 //
77 // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
78 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
79 // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
80 // CHECK: store double [[ADD]], double* [[E]], align
81 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i[[SZ:32|64]] 0, i{{32|64}} 0
82 // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
83 // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
84 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
85 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 1, i[[SZ]] {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
86 // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
87 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
88 //
89 // CHECK: [[IFLABEL]]
90 // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
91 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
92 // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
93 // CHECK: store double [[ADD]], double* [[E_IN]], align
94 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
95 // CHECK: br label %[[EXIT]]
96 //
97 // CHECK: [[EXIT]]
Jonas Hahnfeld891c7fb2017-11-22 14:46:49 +000098 // CHECK: call void @__kmpc_kernel_deinit(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000099
100 //
101 // Reduction function
102 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
103 // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
104 // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
105 // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
106 //
107 // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
108 // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
109 // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
110 //
111 // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
112 // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
113 // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
114 // CHECK: store double [[RES]], double* [[VAR_LHS]],
115 // CHECK: ret void
116
117 //
118 // Shuffle and reduce function
119 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
120 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
121 // CHECK: [[REMOTE_ELT:%.+]] = alloca double
122 //
123 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
124 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
125 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
126 //
127 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
128 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
129 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
130 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000131 //
Alexey Bataev12c62902018-06-22 19:10:38 +0000132 // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
133 // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
134 // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000135 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
136 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
Alexey Bataev12c62902018-06-22 19:10:38 +0000137 // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000138 //
Alexey Bataev12c62902018-06-22 19:10:38 +0000139 // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000140 // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
141 // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
142 //
143 // Condition to reduce
144 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
145 //
146 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
147 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
148 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
149 //
150 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
151 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
152 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
153 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
154 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
155 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
156 //
157 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
158 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
159 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
160 //
161 // CHECK: [[DO_REDUCE]]
162 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
163 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
164 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
165 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
166 //
167 // CHECK: [[REDUCE_ELSE]]
168 // CHECK: br label {{%?}}[[REDUCE_CONT]]
169 //
170 // CHECK: [[REDUCE_CONT]]
171 // Now check if we should just copy over the remote reduction list
172 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
173 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
174 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
175 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
176 //
177 // CHECK: [[DO_COPY]]
178 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
179 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
180 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
181 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000182 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
Alexey Bataevb2575932018-01-04 20:18:55 +0000183 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
Alexey Bataev12c62902018-06-22 19:10:38 +0000184 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000185 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
186 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
187 //
188 // CHECK: [[COPY_ELSE]]
189 // CHECK: br label {{%?}}[[COPY_CONT]]
190 //
191 // CHECK: [[COPY_CONT]]
192 // CHECK: void
193
194 //
195 // Inter warp copy function
196 // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
197 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
198 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
199 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000200 // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
201 // CHECK: br label
202 // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
203 // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
204 // CHECK: br i1 [[DONE_COPY]], label
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000205 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
206 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
207 //
208 // [[DO_COPY]]
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000209 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000210 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000211 // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
212 // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000213 //
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000214 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
215 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
216 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000217 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
218 //
219 // CHECK: [[COPY_ELSE]]
220 // CHECK: br label {{%?}}[[COPY_CONT]]
221 //
222 // Barrier after copy to shared memory storage medium.
223 // CHECK: [[COPY_CONT]]
224 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
225 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
226 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
227 //
228 // Read into warp 0.
229 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
230 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
231 //
232 // CHECK: [[DO_READ]]
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000233 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
234 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000235 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000236 // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
237 // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
238 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
239 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000240 // CHECK: br label {{%?}}[[READ_CONT:.+]]
241 //
242 // CHECK: [[READ_ELSE]]
243 // CHECK: br label {{%?}}[[READ_CONT]]
244 //
245 // CHECK: [[READ_CONT]]
246 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000247 // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
248 // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
249 // CHECK: br label
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000250 // CHECK: ret
251
252 //
253 // Copy to scratchpad function
254 // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32)
255 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
256 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
257 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
258 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
259 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
260 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
261 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
262 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
263 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
264 //
265 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
266 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
267 //
Alexey Bataeve290ec02018-04-06 16:03:36 +0000268 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 8, [[TEAM]]
269 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000270 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000271 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
Alexey Bataevb2575932018-01-04 20:18:55 +0000272 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double*
Alexey Bataev12c62902018-06-22 19:10:38 +0000273 // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000274 // CHECK: store double [[ELT_VAL]], double* [[SCRATCHPAD_ELT_PTR]], align
275 //
276 // CHECK: ret
277
278 //
279 // Load and reduce function
280 // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32)
281 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
282 // CHECK: [[REMOTE_ELT:%.+]] = alloca double
283 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
284 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
285 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
286 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
287 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
288 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
289 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
290 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
291 // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
292 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
293 //
Alexey Bataeve290ec02018-04-06 16:03:36 +0000294 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 8, [[TEAM]]
295 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000296 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
297
298 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
299 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double*
300 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[SCRATCHPAD_ELT_PTR]], align
301 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align
302 // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
303 // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align
304 //
Alexey Bataev9ff80832018-04-16 20:16:21 +0000305 // CHECK: [[REDUCE:%.+]] = icmp ne i32 [[SHOULD_REDUCE]], 0
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000306 // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
307 //
308 // CHECK: [[DO_REDUCE]]
309 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
310 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
311 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
312 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
313 //
314 // Copy element from remote reduce list
315 // CHECK: [[REDUCE_ELSE]]
316 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
317 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
318 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
319 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000320 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
Alexey Bataevb2575932018-01-04 20:18:55 +0000321 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
Alexey Bataev12c62902018-06-22 19:10:38 +0000322 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000323 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
324 // CHECK: br label {{%?}}[[REDUCE_CONT]]
325 //
326 // CHECK: [[REDUCE_CONT]]
327 // CHECK: ret
328
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000329 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000330
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000331 // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l43]](
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000332 //
333 // CHECK: {{call|invoke}} void [[T2]]_worker()
334 //
335 // CHECK: call void @__kmpc_kernel_init(
336 //
337 // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
338 // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
339 // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
340 // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
341 // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
342 // CHECK: store i8 [[TRUNC]], i8* [[C]], align
343 // CHECK: [[DV:%.+]] = load float, float* [[D]], align
344 // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
345 // CHECK: store float [[MUL]], float* [[D]], align
346 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
347 // CHECK: store i8* [[C]], i8** [[PTR1]], align
348 // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
349 // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
350 // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
351 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
352 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
353 // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
354 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
355 //
356 // CHECK: [[IFLABEL]]
357 // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
358 // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
359 // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
360 // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
361 // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
362 // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
363 // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
364 // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
365 // CHECK: [[DV:%.+]] = load float, float* [[D]], align
366 // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
367 // CHECK: store float [[MUL]], float* [[D_IN]], align
368 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
369 // CHECK: br label %[[EXIT]]
370 //
371 // CHECK: [[EXIT]]
Jonas Hahnfeld891c7fb2017-11-22 14:46:49 +0000372 // CHECK: call void @__kmpc_kernel_deinit(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000373
374 //
375 // Reduction function
376 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
377 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
378 // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
379 //
380 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
381 // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
382 //
383 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
384 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
385 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
386 //
387 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
388 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
389 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
390 //
391 // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
392 // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
393 // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
394 // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
395 // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
396 // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
397 // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
398 //
399 // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
400 // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
401 // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
402 // CHECK: store float [[RES]], float* [[VAR2_LHS]],
403 // CHECK: ret void
404
405 //
406 // Shuffle and reduce function
407 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
408 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
409 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
410 // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
411 //
412 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
413 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
414 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
415 //
416 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
417 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
418 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
419 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
420 //
421 // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
422 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
423 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
424 // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
425 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
426 //
427 // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
428 // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
429 //
430 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
431 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
432 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
433 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000434 //
Alexey Bataev12c62902018-06-22 19:10:38 +0000435 // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
436 // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
437 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000438 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
439 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
Alexey Bataev12c62902018-06-22 19:10:38 +0000440 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000441 //
Alexey Bataev12c62902018-06-22 19:10:38 +0000442 // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000443 // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
444 // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
445 //
446 // Condition to reduce
447 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
448 //
449 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
450 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
451 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
452 //
453 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
454 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
455 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
456 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
457 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
458 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
459 //
460 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
461 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
462 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
463 //
464 // CHECK: [[DO_REDUCE]]
465 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
466 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
467 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
468 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
469 //
470 // CHECK: [[REDUCE_ELSE]]
471 // CHECK: br label {{%?}}[[REDUCE_CONT]]
472 //
473 // CHECK: [[REDUCE_CONT]]
474 // Now check if we should just copy over the remote reduction list
475 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
476 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
477 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
478 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
479 //
480 // CHECK: [[DO_COPY]]
481 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
482 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
483 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
484 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
485 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
486 // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
487 //
488 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
489 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
490 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
491 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000492 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
Alexey Bataevb2575932018-01-04 20:18:55 +0000493 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
Alexey Bataev12c62902018-06-22 19:10:38 +0000494 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000495 // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
496 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
497 //
498 // CHECK: [[COPY_ELSE]]
499 // CHECK: br label {{%?}}[[COPY_CONT]]
500 //
501 // CHECK: [[COPY_CONT]]
502 // CHECK: void
503
504 //
505 // Inter warp copy function
506 // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
507 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
508 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
509 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
510 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
511 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
512 //
513 // [[DO_COPY]]
514 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
515 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000516 //
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000517 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
518 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
Alexey Bataev12c62902018-06-22 19:10:38 +0000519 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000520 // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000521 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
522 //
523 // CHECK: [[COPY_ELSE]]
524 // CHECK: br label {{%?}}[[COPY_CONT]]
525 //
526 // Barrier after copy to shared memory storage medium.
527 // CHECK: [[COPY_CONT]]
528 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
529 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
530 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
531 //
532 // Read into warp 0.
533 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
534 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
535 //
536 // CHECK: [[DO_READ]]
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000537 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
538 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000539 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
540 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000541 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000542 // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
543 // CHECK: br label {{%?}}[[READ_CONT:.+]]
544 //
545 // CHECK: [[READ_ELSE]]
546 // CHECK: br label {{%?}}[[READ_CONT]]
547 //
548 // CHECK: [[READ_CONT]]
549 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
550 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
551 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
552 //
553 // [[DO_COPY]]
554 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
555 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000556 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000557 //
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000558 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
559 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
560 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000561 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
562 //
563 // CHECK: [[COPY_ELSE]]
564 // CHECK: br label {{%?}}[[COPY_CONT]]
565 //
566 // Barrier after copy to shared memory storage medium.
567 // CHECK: [[COPY_CONT]]
568 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
569 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
570 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
571 //
572 // Read into warp 0.
573 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
574 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
575 //
576 // CHECK: [[DO_READ]]
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000577 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000578 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
579 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000580 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
581 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
582 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000583 // CHECK: br label {{%?}}[[READ_CONT:.+]]
584 //
585 // CHECK: [[READ_ELSE]]
586 // CHECK: br label {{%?}}[[READ_CONT]]
587 //
588 // CHECK: [[READ_CONT]]
589 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
590 // CHECK: ret
591
592 //
593 // Copy to scratchpad function
594 // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32)
595 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
596 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
597 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
598 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
599 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
600 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
601 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
602 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
603 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
604 //
605 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
606 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
607 //
Alexey Bataeve290ec02018-04-06 16:03:36 +0000608 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 1, [[TEAM]]
609 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000610 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
611 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
612 // CHECK: store i8 [[ELT_VAL]], i8* [[SCRATCHPAD_ELT_PTR]], align
613 //
Alexey Bataeve290ec02018-04-06 16:03:36 +0000614 // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 1
615 // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
616 // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
Alexey Bataev9ea3c382018-10-09 14:49:00 +0000617 // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128
Alexey Bataeve290ec02018-04-06 16:03:36 +0000618 // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
Alexey Bataev9ea3c382018-10-09 14:49:00 +0000619 // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000620 //
621 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
622 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
623 //
Alexey Bataeve290ec02018-04-06 16:03:36 +0000624 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
625 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000626 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000627 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
Alexey Bataevb2575932018-01-04 20:18:55 +0000628 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float*
Alexey Bataev12c62902018-06-22 19:10:38 +0000629 // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000630 // CHECK: store float [[ELT_VAL]], float* [[SCRATCHPAD_ELT_PTR]], align
631 //
632 // CHECK: ret
633
634 //
635 // Load and reduce function
636 // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32)
637 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
638 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
639 // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
640 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
641 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
642 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
643 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
644 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
645 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
646 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
647 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
648 // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
649 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
650 //
Alexey Bataeve290ec02018-04-06 16:03:36 +0000651 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 1, [[TEAM]]
652 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000653 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
654
655 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
656 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[SCRATCHPAD_ELT_PTR_VOID]], align
657 // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[REMOTE_ELT1]], align
658 // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
659 //
Alexey Bataeve290ec02018-04-06 16:03:36 +0000660 // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 1
661 // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
662 // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
Alexey Bataev9ea3c382018-10-09 14:49:00 +0000663 // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128
Alexey Bataeve290ec02018-04-06 16:03:36 +0000664 // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
Alexey Bataev9ea3c382018-10-09 14:49:00 +0000665 // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000666 //
Alexey Bataeve290ec02018-04-06 16:03:36 +0000667 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
668 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000669 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
670
671 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
672 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float*
673 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[SCRATCHPAD_ELT_PTR]], align
674 // CHECK: store float [[REMOTE_ELT_VAL]], float* [[REMOTE_ELT2]], align
675 // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
676 // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align
677 //
Alexey Bataev9ff80832018-04-16 20:16:21 +0000678 // CHECK: [[REDUCE:%.+]] = icmp ne i32 [[SHOULD_REDUCE]], 0
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000679 // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
680 //
681 // CHECK: [[DO_REDUCE]]
682 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
683 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
684 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
685 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
686 //
687 // Copy element from remote reduce list
688 // CHECK: [[REDUCE_ELSE]]
689 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
690 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
691 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
692 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
693 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
694 // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
695 //
696 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
697 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
698 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
699 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000700 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
Alexey Bataevb2575932018-01-04 20:18:55 +0000701 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
Alexey Bataev12c62902018-06-22 19:10:38 +0000702 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000703 // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
704 // CHECK: br label {{%?}}[[REDUCE_CONT]]
705 //
706 // CHECK: [[REDUCE_CONT]]
707 // CHECK: ret
708
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000709 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}(
710 //
711 // CHECK: call void @__kmpc_spmd_kernel_init(
712 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
713 // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**))
714 // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
715 // CHECK: [[GLOBAL_REC:%.+]] = bitcast i8* [[PTR]] to [[GLOB_REC_TY:%.+]]*
716 // CHECK-DAG: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 0
717 // CHECK-DAG: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 1
718 // CHECK: store i32 0, i32* [[A_ADDR]],
719 // CHECK: store i16 -32768, i16* [[B_ADDR]],
720 // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
721 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
722 // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A_ADDR]] to i8*
723 // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
724 // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
725 // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B_ADDR]] to i8*
726 // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
727 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
728 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
729 // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
730 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
731 //
732 // CHECK: [[IFLABEL]]
733 // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
734 // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
735 // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
736 // CHECK: store i32 [[OR]], i32* [[A_IN]], align
737 // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
738 // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
739 // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
740 // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
741 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
742 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
743 //
744 // CHECK: [[DO_MAX]]
745 // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
746 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
747 //
748 // CHECK: [[MAX_ELSE]]
749 // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
750 // CHECK: br label {{%?}}[[MAX_CONT]]
751 //
752 // CHECK: [[MAX_CONT]]
753 // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
754 // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
755 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
756 // CHECK: br label %[[EXIT]]
757 //
758 // CHECK: [[EXIT]]
759 // call void @__kmpc_restore_team_static_memory(i16 1)
760 // CHECK: call void @__kmpc_spmd_kernel_deinit(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000761
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000762 // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}})
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000763 //
764 // CHECK: store i32 0, i32* [[A:%.+]], align
765 // CHECK: store i16 -32768, i16* [[B:%.+]], align
766 // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
767 // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
768 // CHECK: store i32 [[OR]], i32* [[A]], align
769 // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
770 // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
771 // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
772 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
773 //
774 // CHECK: [[DO_MAX]]
775 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
776 //
777 // CHECK: [[MAX_ELSE]]
778 // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
779 // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
780 // CHECK: br label {{%?}}[[MAX_CONT]]
781 //
782 // CHECK: [[MAX_CONT]]
783 // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
784 // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
785 // CHECK: store i16 [[TRUNC]], i16* [[B]], align
786 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
787 // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
788 // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
789 // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
790 // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
791 // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
792 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000793 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000794 // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
795 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
796 //
797 // CHECK: [[IFLABEL]]
798 // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
799 // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
800 // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
801 // CHECK: store i32 [[OR]], i32* [[A_IN]], align
802 // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
803 // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
804 // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
805 // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
806 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
807 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
808 //
809 // CHECK: [[DO_MAX]]
810 // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
811 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
812 //
813 // CHECK: [[MAX_ELSE]]
814 // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
815 // CHECK: br label {{%?}}[[MAX_CONT]]
816 //
817 // CHECK: [[MAX_CONT]]
818 // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
819 // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
820 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
821 // CHECK: br label %[[EXIT]]
822 //
823 // CHECK: [[EXIT]]
Alexey Bataevf2f39be2018-11-16 19:38:21 +0000824 // CHECK: ret void
825
826 //
827 // Reduction function
828 // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8*, i8*)
829 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
830 // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
831 // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
832 //
833 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
834 // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
835 // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
836 //
837 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
838 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
839 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
840 //
841 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
842 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
843 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
844 //
845 // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
846 // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
847 // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
848 // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
849 //
850 // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
851 // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
852 // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
853 // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
854 //
855 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
856 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
857 //
858 // CHECK: [[DO_MAX]]
859 // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
860 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
861 //
862 // CHECK: [[MAX_ELSE]]
863 // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
864 // CHECK: br label {{%?}}[[MAX_CONT]]
865 //
866 // CHECK: [[MAX_CONT]]
867 // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
868 // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
869 // CHECK: ret void
870
871 //
872 // Shuffle and reduce function
873 // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
874 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
875 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
876 // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
877 //
878 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
879 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
880 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
881 //
882 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
883 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
884 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
885 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
886 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
887 //
888 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
889 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
890 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
891 //
892 // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
893 // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
894 // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
895 //
896 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
897 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
898 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
899 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
900 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
901 //
902 // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
903 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
904 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
905 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
906 // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
907 //
908 // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
909 // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
910 // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
911 //
912 // Condition to reduce
913 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
914 //
915 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
916 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
917 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
918 //
919 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
920 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
921 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
922 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
923 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
924 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
925 //
926 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
927 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
928 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
929 //
930 // CHECK: [[DO_REDUCE]]
931 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
932 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
933 // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
934 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
935 //
936 // CHECK: [[REDUCE_ELSE]]
937 // CHECK: br label {{%?}}[[REDUCE_CONT]]
938 //
939 // CHECK: [[REDUCE_CONT]]
940 // Now check if we should just copy over the remote reduction list
941 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
942 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
943 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
944 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
945 //
946 // CHECK: [[DO_COPY]]
947 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
948 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
949 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
950 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
951 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
952 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
953 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
954 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
955 //
956 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
957 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
958 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
959 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
960 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
961 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
962 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
963 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
964 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
965 //
966 // CHECK: [[COPY_ELSE]]
967 // CHECK: br label {{%?}}[[COPY_CONT]]
968 //
969 // CHECK: [[COPY_CONT]]
970 // CHECK: void
971
972 //
973 // Inter warp copy function
974 // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8*, i32)
975 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
976 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
977 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
978 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
979 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
980 //
981 // [[DO_COPY]]
982 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
983 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
984 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
985 //
986 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
987 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
988 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
989 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
990 //
991 // CHECK: [[COPY_ELSE]]
992 // CHECK: br label {{%?}}[[COPY_CONT]]
993 //
994 // Barrier after copy to shared memory storage medium.
995 // CHECK: [[COPY_CONT]]
996 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
997 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
998 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
999 //
1000 // Read into warp 0.
1001 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
1002 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
1003 //
1004 // CHECK: [[DO_READ]]
1005 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
1006 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1007 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1008 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
1009 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1010 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
1011 // CHECK: br label {{%?}}[[READ_CONT:.+]]
1012 //
1013 // CHECK: [[READ_ELSE]]
1014 // CHECK: br label {{%?}}[[READ_CONT]]
1015 //
1016 // CHECK: [[READ_CONT]]
1017 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
1018 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
1019 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
1020 //
1021 // [[DO_COPY]]
1022 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1023 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1024 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1025 //
1026 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
1027 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
1028 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
1029 // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1030 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
1031 //
1032 // CHECK: [[COPY_ELSE]]
1033 // CHECK: br label {{%?}}[[COPY_CONT]]
1034 //
1035 // Barrier after copy to shared memory storage medium.
1036 // CHECK: [[COPY_CONT]]
1037 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1038 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
1039 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
1040 //
1041 // Read into warp 0.
1042 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
1043 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
1044 //
1045 // CHECK: [[DO_READ]]
1046 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
1047 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
1048 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
1049 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1050 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1051 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1052 // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
1053 // CHECK: br label {{%?}}[[READ_CONT:.+]]
1054 //
1055 // CHECK: [[READ_ELSE]]
1056 // CHECK: br label {{%?}}[[READ_CONT]]
1057 //
1058 // CHECK: [[READ_CONT]]
1059 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
1060 // CHECK: ret
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001061
1062 //
1063 // Reduction function
1064 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
1065 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
1066 // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
1067 // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
1068 //
1069 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
1070 // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
1071 // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
1072 //
1073 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
1074 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
1075 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
1076 //
1077 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
1078 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
1079 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
1080 //
1081 // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
1082 // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
1083 // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
1084 // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
1085 //
1086 // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
1087 // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
1088 // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
1089 // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
1090 //
1091 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
1092 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
1093 //
1094 // CHECK: [[DO_MAX]]
1095 // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
1096 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
1097 //
1098 // CHECK: [[MAX_ELSE]]
1099 // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
1100 // CHECK: br label {{%?}}[[MAX_CONT]]
1101 //
1102 // CHECK: [[MAX_CONT]]
1103 // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
1104 // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
1105 // CHECK: ret void
1106
1107 //
1108 // Shuffle and reduce function
1109 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
1110 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
1111 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
1112 // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
1113 //
1114 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
1115 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
1116 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
1117 //
1118 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1119 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1120 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1121 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
1122 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
1123 //
1124 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1125 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
1126 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
1127 //
1128 // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
1129 // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
1130 // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
1131 //
1132 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1133 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1134 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1135 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1136 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
1137 //
1138 // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
1139 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1140 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
1141 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
1142 // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
1143 //
1144 // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
1145 // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
1146 // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
1147 //
1148 // Condition to reduce
1149 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
1150 //
1151 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
1152 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
1153 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
1154 //
1155 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
1156 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
1157 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
1158 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
1159 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
1160 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
1161 //
1162 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
1163 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
1164 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
1165 //
1166 // CHECK: [[DO_REDUCE]]
1167 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
1168 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
1169 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
1170 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
1171 //
1172 // CHECK: [[REDUCE_ELSE]]
1173 // CHECK: br label {{%?}}[[REDUCE_CONT]]
1174 //
1175 // CHECK: [[REDUCE_CONT]]
1176 // Now check if we should just copy over the remote reduction list
1177 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
1178 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
1179 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
1180 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
1181 //
1182 // CHECK: [[DO_COPY]]
1183 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
1184 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
1185 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
1186 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001187 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
Alexey Bataevb2575932018-01-04 20:18:55 +00001188 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
Alexey Bataev12c62902018-06-22 19:10:38 +00001189 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001190 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
1191 //
1192 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1193 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
1194 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1195 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001196 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
Alexey Bataevb2575932018-01-04 20:18:55 +00001197 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
Alexey Bataev12c62902018-06-22 19:10:38 +00001198 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001199 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
1200 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
1201 //
1202 // CHECK: [[COPY_ELSE]]
1203 // CHECK: br label {{%?}}[[COPY_CONT]]
1204 //
1205 // CHECK: [[COPY_CONT]]
1206 // CHECK: void
1207
1208 //
1209 // Inter warp copy function
1210 // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
1211 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
1212 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
1213 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
1214 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
1215 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
1216 //
1217 // [[DO_COPY]]
1218 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
1219 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1220 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001221 //
Alexey Bataevf2f39be2018-11-16 19:38:21 +00001222 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
Alexey Bataev12c62902018-06-22 19:10:38 +00001223 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
Alexey Bataevf2f39be2018-11-16 19:38:21 +00001224 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001225 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
1226 //
1227 // CHECK: [[COPY_ELSE]]
1228 // CHECK: br label {{%?}}[[COPY_CONT]]
1229 //
1230 // Barrier after copy to shared memory storage medium.
1231 // CHECK: [[COPY_CONT]]
1232 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1233 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
1234 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
1235 //
1236 // Read into warp 0.
1237 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
1238 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
1239 //
1240 // CHECK: [[DO_READ]]
Alexey Bataevf2f39be2018-11-16 19:38:21 +00001241 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001242 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1243 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1244 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
Alexey Bataevf2f39be2018-11-16 19:38:21 +00001245 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001246 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
1247 // CHECK: br label {{%?}}[[READ_CONT:.+]]
1248 //
1249 // CHECK: [[READ_ELSE]]
1250 // CHECK: br label {{%?}}[[READ_CONT]]
1251 //
1252 // CHECK: [[READ_CONT]]
1253 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
1254 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
1255 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
1256 //
1257 // [[DO_COPY]]
1258 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1259 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1260 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001261 //
Alexey Bataevf2f39be2018-11-16 19:38:21 +00001262 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
1263 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
Alexey Bataev12c62902018-06-22 19:10:38 +00001264 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
Alexey Bataevf2f39be2018-11-16 19:38:21 +00001265 // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001266 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
1267 //
1268 // CHECK: [[COPY_ELSE]]
1269 // CHECK: br label {{%?}}[[COPY_CONT]]
1270 //
1271 // Barrier after copy to shared memory storage medium.
1272 // CHECK: [[COPY_CONT]]
1273 // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1274 // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
1275 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
1276 //
1277 // Read into warp 0.
1278 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
1279 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
1280 //
1281 // CHECK: [[DO_READ]]
Alexey Bataevf2f39be2018-11-16 19:38:21 +00001282 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
1283 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001284 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
1285 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1286 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
Alexey Bataevf2f39be2018-11-16 19:38:21 +00001287 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001288 // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
1289 // CHECK: br label {{%?}}[[READ_CONT:.+]]
1290 //
1291 // CHECK: [[READ_ELSE]]
1292 // CHECK: br label {{%?}}[[READ_CONT]]
1293 //
1294 // CHECK: [[READ_CONT]]
1295 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
1296 // CHECK: ret
1297
1298 //
1299 // Copy to scratchpad function
1300 // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32)
1301 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
1302 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
1303 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
1304 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
1305 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
1306 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
1307 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
1308 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
1309 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
1310 //
1311 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
1312 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1313 //
Alexey Bataeve290ec02018-04-06 16:03:36 +00001314 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
1315 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001316 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001317 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
Alexey Bataevb2575932018-01-04 20:18:55 +00001318 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
Alexey Bataev12c62902018-06-22 19:10:38 +00001319 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001320 // CHECK: store i32 [[ELT_VAL]], i32* [[SCRATCHPAD_ELT_PTR]], align
1321 //
Alexey Bataeve290ec02018-04-06 16:03:36 +00001322 // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 4
1323 // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
1324 // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
Alexey Bataev9ea3c382018-10-09 14:49:00 +00001325 // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128
Alexey Bataeve290ec02018-04-06 16:03:36 +00001326 // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
Alexey Bataev9ea3c382018-10-09 14:49:00 +00001327 // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001328 //
1329 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1330 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1331 //
Alexey Bataeve290ec02018-04-06 16:03:36 +00001332 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 2, [[TEAM]]
1333 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001334 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001335 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
Alexey Bataevb2575932018-01-04 20:18:55 +00001336 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16*
Alexey Bataev12c62902018-06-22 19:10:38 +00001337 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001338 // CHECK: store i16 [[ELT_VAL]], i16* [[SCRATCHPAD_ELT_PTR]], align
1339 //
1340 // CHECK: ret
1341
1342 //
1343 // Load and reduce function
1344 // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32)
1345 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
1346 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
1347 // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
1348 // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
1349 // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
1350 // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
1351 // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
1352 // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
1353 // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
1354 // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
1355 // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
1356 // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
1357 // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
1358 //
Alexey Bataeve290ec02018-04-06 16:03:36 +00001359 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
1360 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001361 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
1362
1363 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
1364 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
1365 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[SCRATCHPAD_ELT_PTR]], align
1366 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[REMOTE_ELT1]], align
1367 // CHECK: [[REMOTE_ELT1_PTR:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
1368 // CHECK: store i8* [[REMOTE_ELT1_PTR]], i8** [[REMOTE_ELT_REF]], align
1369 //
Alexey Bataeve290ec02018-04-06 16:03:36 +00001370 // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 4
1371 // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
1372 // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
Alexey Bataev9ea3c382018-10-09 14:49:00 +00001373 // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128
Alexey Bataeve290ec02018-04-06 16:03:36 +00001374 // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
Alexey Bataev9ea3c382018-10-09 14:49:00 +00001375 // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001376 //
Alexey Bataeve290ec02018-04-06 16:03:36 +00001377 // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 2, [[TEAM]]
1378 // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001379 // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
1380
1381 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1382 // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16*
1383 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[SCRATCHPAD_ELT_PTR]], align
1384 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[REMOTE_ELT2]], align
1385 // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
1386 // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align
1387 //
Alexey Bataev9ff80832018-04-16 20:16:21 +00001388 // CHECK: [[REDUCE:%.+]] = icmp ne i32 [[SHOULD_REDUCE]], 0
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001389 // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
1390 //
1391 // CHECK: [[DO_REDUCE]]
1392 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
1393 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
1394 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
1395 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
1396 //
1397 // Copy element from remote reduce list
1398 // CHECK: [[REDUCE_ELSE]]
1399 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1400 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
1401 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1402 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001403 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
Alexey Bataevb2575932018-01-04 20:18:55 +00001404 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
Alexey Bataev12c62902018-06-22 19:10:38 +00001405 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001406 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
1407 //
1408 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
1409 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
1410 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
1411 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001412 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
Alexey Bataevb2575932018-01-04 20:18:55 +00001413 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
Alexey Bataev12c62902018-06-22 19:10:38 +00001414 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001415 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
1416 // CHECK: br label {{%?}}[[REDUCE_CONT]]
1417 //
1418 // CHECK: [[REDUCE_CONT]]
1419 // CHECK: ret
1420
1421
1422#endif